import scrapy
from scrapy import Request


class AsphaltImportedSpider(scrapy.Spider):
    """Scrapes bulk imported asphalt CIF prices (散装进口沥青到岸价) from baiinfo.com."""

    name = "asphalt_imported"
    # (source label, url) pairs for the first five pages of the asphalt (沥青)
    # news category. start_requests() is overridden below, so Scrapy's default
    # handling of start_urls never sees these tuples.
    start_urls = [
        ('沥青', f"http://www.baiinfo.com/news/newscategory/17847/18/{page}")
        for page in range(1, 6)
    ]
    # Both attributes are normally injected by the runner in __main__ below;
    # the user agent has a usable default for standalone runs.
    cookie = None
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    )

    def start_requests(self):
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={'User-Agent': self.user_agent},
                cookies=self.cookie,
                meta={'source': source, 'type': 'home'},
            )

    def parse(self, response, **kwargs):
        # Route each response to the right handler based on the tag set in meta.
        if response.meta['type'] == 'home':
            yield from self.parse_home(response)
        elif response.meta['type'] == 'list_page':
            yield from self.parse_list_page(response)

    def parse_home(self, response):
        # Walk the article links on the category page and follow only the
        # bulk imported asphalt CIF price (散装进口沥青到岸价) reports.
        for item in response.xpath('//*[@id="__nuxt"]/div/div[5]/div/div[2]/div[1]/div[1]/div[2]/ul/li/a'):
            name = item.xpath('text()').get()
            if name and '散装进口沥青到岸价' in name:
                url = response.urljoin(item.xpath('@href').get())
                self.logger.info('%s %s', name, url)
                yield Request(
                    method='GET',
                    url=url,
                    headers={'User-Agent': self.user_agent},
                    cookies=self.cookie,
                    meta={'source': name, 'type': 'list_page'},
                )

    def parse_list_page(self, response):
        # The publication date is rendered as e.g. '2024年01月15日';
        # normalize it to '2024-01-15'.
        date = response.xpath('//*[@id="__nuxt"]/div/div[5]/div/div[2]/div[1]/div[2]/p[1]/span[4]/text()').get()
        date = date.strip('日').replace('年', '-').replace('月', '-')
        for item in response.xpath('//*[@id="__nuxt"]/div/div[5]/div/div[2]/div[1]/div[3]/div/div/table/tbody/tr'):
            # Each row nests its cell text five <span> levels deep; keep the
            # first seven cells.
            line = [cell.xpath('text()').get() for cell in item.xpath('td/span/span/span/span/span')][:7]
            self.logger.debug('%s', line)
            # Skip empty rows and header rows ('备注' = remarks, '品质' = quality).
            if not line or line[-1] == '备注' or '品质' in (line[0] or ''):
                continue
            name, *_, price, _fluctuating = line
            yield {
                'name': name,
                'date': date,
                # Prices may be quoted as a range like '3500-3600';
                # keep the upper bound.
                'price': int(price.split('-')[-1]),
            }


if __name__ == '__main__':
    import json

    from spiders import run_spider, BaiinfoCookieTools
    from commons.models.asphalt_imported import AsphaltImported
    from core.factory import ClientApp

    # Load cookies
    cookie = BaiinfoCookieTools.get_cookies()

    # Crawl
    AsphaltImportedSpider.cookie = cookie
    AsphaltImportedSpider.user_agent = BaiinfoCookieTools.user_agent
    file_path = run_spider(AsphaltImportedSpider)

    # Persist to the database
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    with ClientApp().app_context():
        for item in data:
            print(item)
            AsphaltImported(**item).upsert()
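
# ---------------------------------------------------------------------------
# Reference sketch only: the real run_spider() lives in the project's
# `spiders` package and is not shown in this file. The function below is a
# minimal guess at its shape -- run the spider with a JSON feed export and
# return the output path -- using only documented Scrapy APIs. The name
# `_run_spider_sketch` and the output path are hypothetical, not project code.
def _run_spider_sketch(spider_cls, out_path='output/asphalt_imported.json'):
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={
        # Write every yielded item to a JSON file, replacing any previous run.
        'FEEDS': {out_path: {'format': 'json', 'encoding': 'utf-8', 'overwrite': True}},
    })
    process.crawl(spider_cls)
    process.start()  # blocks until the crawl finishes
    return out_path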