84 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			84 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import json
 | |
| 
 | |
| import scrapy
 | |
| from scrapy import Request
 | |
| 
 | |
| 
 | |
class OilSpider(scrapy.Spider):
    """Scrape refined-oil price tables from fgw.fujian.gov.cn search results.

    The crawl has two stages, distinguished by ``response.meta['type']``:

    * ``'list'``   - a JSON search result whose ``docs`` entries link to notices.
    * ``'detail'`` - a notice page whose first table holds name/price rows.

    Each scraped item is a dict with the keys ``name``, ``price`` (int) and
    ``date`` (the part of the listing's ``pubtime`` before the first space).
    """

    name = "oil"
    # (source label, search URL) pairs. These are tuples rather than plain URL
    # strings, which is why start_requests() is overridden to unpack them.
    start_urls = [
        ('成品油价格调整', "https://fgw.fujian.gov.cn/was5/web/search?channelid=217025&templet=advsch.jsp&sortfield=-docreltime&classsql=%25%E6%88%90%E5%93%81%E6%B2%B9%E4%BB%B7%E6%A0%BC%E8%B0%83%E6%95%B4%25*siteid%3D31*siteid%3D31&prepage=100&page=1"),
    ]
    # First-cell texts that mark a header/unit row ("oil product", "yuan/ton")
    # instead of a price row.
    _HEADER_WORDS = ('油品', '元/吨')

    def start_requests(self):
        """Yield one 'list' request per (source, url) pair in ``start_urls``."""
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
                },
                meta={'source': source, 'type': 'list'}
            )

    def parse(self, response, **kwargs):
        """Dispatch a response to the list or detail parser via ``meta['type']``."""
        kind = response.meta['type']
        if kind == 'list':
            yield from self.parse_list(response)
        elif kind == 'detail':
            yield from self.parse_detail(response)
        else:
            self.logger.warning("Ignoring response with unknown type %r: %s", kind, response.url)

    def parse_list(self, response):
        """Yield a 'detail' request for every titled document in a search result.

        The body is parsed as JSON after removing newline characters. Documents
        without ``title2`` are skipped silently (as before); documents lacking
        ``url`` or ``pubtime`` are skipped with a warning instead of raising
        ``KeyError`` and aborting the rest of the page.
        """
        payload = json.loads(response.text.replace('\n', ''))
        if not (payload.get('count') and payload.get('docs')):
            return
        for doc in payload['docs']:
            title = doc.get('title2')
            if not title:
                continue
            url = doc.get('url')
            pubtime = doc.get('pubtime')
            if not url or not pubtime:
                self.logger.warning("Skipping %r: missing url or pubtime", title)
                continue
            label = f"{title} {pubtime}"
            self.logger.info("%s", label)
            yield Request(
                method='GET',
                url=url,
                meta={'source': label, 'type': 'detail', 'time': pubtime}
            )

    def parse_detail(self, response):
        """Yield ``{'name', 'price', 'date'}`` items from the first table's rows.

        Two row layouts are handled:

        * the name sits in ``td/span`` and the price is the first bare ``td``
          text node;
        * name and price are the first and second bare ``td`` text nodes.

        Header/unit rows are skipped. Rows whose price is missing or is not an
        integer are logged and skipped, so one odd row no longer aborts the
        page. ``price`` is always an ``int``.
        """
        date = response.meta['time'].split(' ')[0]
        for row in response.xpath('//table[1]/tbody/tr'):
            span_texts = row.xpath('td/span/text()').getall()
            cell_texts = row.xpath('td/text()').getall()

            if span_texts:
                name = span_texts[0]
                if name.strip() in self._HEADER_WORDS:
                    continue
                raw_price = cell_texts[0] if cell_texts else None
            elif cell_texts:
                name = cell_texts[0]
                # Rows starting with a non-breaking space or newline are
                # padding/continuation cells, not data rows.
                if name.strip() in self._HEADER_WORDS or name.startswith(('\xa0', '\n')):
                    continue
                raw_price = cell_texts[1] if len(cell_texts) > 1 else None
            else:
                continue

            price = self._parse_price(raw_price)
            if price is None:
                self.logger.warning(
                    "Skipping row %r with unparseable price %r at %s",
                    name, raw_price, response.url,
                )
                continue
            yield {
                'name': name,
                'price': price,
                'date': date,
            }

    @staticmethod
    def _parse_price(raw):
        """Return ``raw`` converted to int, or None if absent or not an integer."""
        if raw is None:
            return None
        try:
            # int() tolerates surrounding whitespace on its own.
            return int(raw)
        except ValueError:
            return None
 | |
| 
 | |
| 
 | |
if __name__ == '__main__':
    # Project-local imports are kept inside the guard so importing this module
    # as a spider does not require them.
    from spiders import run_spider
    from commons.models.oil import Oil
    from core.factory import ClientApp

    # Crawl: run the spider; the returned path is read below as a JSON file.
    file_path = run_spider(OilSpider)
    # Store: load the scraped items, closing the file deterministically.
    with open(file_path, 'r', encoding='utf-8') as result_file:
        data = json.load(result_file)
    with ClientApp().app_context():
        for item in data:
            print(item)
            Oil(**item).upsert()
 | 
