"""Scrapy spider collecting refined-oil price adjustments from fgw.fujian.gov.cn."""
import json

import scrapy
from scrapy import Request


# Header cells of the price tables; a row whose first cell is one of these
# carries no price data.
_HEADER_CELLS = frozenset({'油品', '元/吨'})


def _parse_price(text):
    """Return *text* as an ``int`` when it is a plain integer, else unchanged.

    Both table layouts go through this helper so the ``price`` field has the
    same type regardless of page markup. Text that is not an integer is
    passed through untouched rather than raising, so one odd cell does not
    abort parsing of the remaining rows.
    """
    try:
        return int(text)
    except (TypeError, ValueError):
        return text


class OilSpider(scrapy.Spider):
    """Crawl the search listing, then extract prices from each detail page.

    Requests are tagged with ``meta['type']`` (``'list'`` or ``'detail'``)
    and :meth:`parse` dispatches on that tag.
    """

    name = "oil"
    # (source label, listing URL) pairs; consumed by ``start_requests`` below,
    # not by Scrapy's default start logic, which expects plain URL strings.
    start_urls = [
        ('成品油价格调整', "https://fgw.fujian.gov.cn/was5/web/search?channelid=217025&templet=advsch.jsp&sortfield=-docreltime&classsql=%25%E6%88%90%E5%93%81%E6%B2%B9%E4%BB%B7%E6%A0%BC%E8%B0%83%E6%95%B4%25*siteid%3D31*siteid%3D31&prepage=100&page=1"),
    ]

    def start_requests(self):
        """Yield one listing request per ``start_urls`` entry."""
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
                },
                meta={'source': source, 'type': 'list'}
            )

    def parse(self, response, **kwargs):
        """Dispatch the response to the handler matching ``meta['type']``."""
        page_type = response.meta['type']
        if page_type == 'list':
            yield from self.parse_list(response)
        elif page_type == 'detail':
            yield from self.parse_detail(response)

    def parse_list(self, response):
        """Yield a detail-page request for every usable entry of the listing.

        The response body is parsed as JSON with ``count`` and ``docs`` keys.
        Entries lacking ``title2``, ``url`` or ``pubtime`` are skipped, so an
        incomplete entry no longer raises ``KeyError`` and drops the entries
        that follow it.
        """
        # NOTE(review): newlines are removed before parsing, presumably
        # because the endpoint emits raw line breaks that strict JSON
        # rejects - confirm against a live response.
        listing = json.loads(response.text.replace('\n', ''))
        if not (listing.get('count') and listing.get('docs')):
            return

        for doc in listing['docs']:
            title = doc.get('title2')
            url = doc.get('url')
            pubtime = doc.get('pubtime')
            if not (title and url and pubtime):
                continue

            label = f"{title} {pubtime}"
            self.logger.info("%s", label)
            yield Request(
                method='GET',
                url=url,
                meta={'source': label, 'type': 'detail', 'time': pubtime}
            )

    def parse_detail(self, response):
        """Yield ``{'name', 'price', 'date'}`` items from the first table.

        Two row layouts are handled:

        * the product name sits in ``td/span`` and the price is the first
          bare ``td`` text node;
        * name and price are the first two bare ``td`` text nodes.

        Header rows, rows starting with a non-breaking space or newline, and
        rows with too few cells are skipped. ``date`` is the part of
        ``meta['time']`` before the first space.
        """
        date = response.meta['time'].split(' ')[0]

        for row in response.xpath('//table[1]/tbody/tr'):
            # Evaluate each XPath once per row and reuse the results.
            span_texts = row.xpath('td/span/text()').getall()
            cell_texts = row.xpath('td/text()').getall()

            if span_texts:
                name = span_texts[0]
                # A span row without any bare cell text has no price to read.
                if name.strip() in _HEADER_CELLS or not cell_texts:
                    continue
                price = cell_texts[0]
            elif cell_texts:
                first_cell = cell_texts[0]
                if (
                    first_cell.strip() in _HEADER_CELLS
                    or first_cell.startswith(('\xa0', '\n'))
                    or len(cell_texts) < 2
                ):
                    continue
                name, price = cell_texts[:2]
            else:
                continue

            yield {
                'name': name,
                'price': _parse_price(price),
                'date': date,
            }


if __name__ == '__main__':
    from spiders import run_spider
    from commons.models.oil import Oil
    from core.factory import ClientApp

    # Crawl; the returned path is read back as JSON below.
    file_path = run_spider(OilSpider)

    # Persist: load the crawl output and upsert every item.
    with open(file_path, 'r', encoding='utf-8') as fh:
        data = json.load(fh)
    with ClientApp().app_context():
        for item in data:
            print(item)
            Oil(**item).upsert()