import scrapy
from scrapy import Request


class CementSpider(scrapy.Spider):
    """Spider for cement market reference prices on baiinfo.com.

    Crawls the first 10 pages of the cement (水泥) news category, follows
    every article whose title contains '福建水泥市场参考价格', and yields one
    dict per table row with name/price/spec/pack/date/source/fluctuating.
    """

    name = "cement"

    # (source label, listing URL) pairs for pages 1-10 of the category.
    # Generated instead of hand-copied: the ten originals differed only in
    # the trailing page number.
    start_urls = [
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/{}".format(page))
        for page in range(1, 11)
    ]

    # Injected by the __main__ driver (bottom of file) before the crawl runs.
    cookie = None
    user_agent = None

    def start_requests(self):
        """Issue one GET per listing page, tagging each request as 'home'."""
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={
                    'User-Agent': self.user_agent,
                },
                cookies=self.cookie,
                meta={'source': source, 'type': 'home'}
            )

    def parse(self, response, **kwargs):
        """Dispatch on the 'type' tag placed in request meta."""
        if response.meta['type'] == 'home':
            yield from self.parse_home(response)
        elif response.meta['type'] == 'list_page':
            yield from self.parse_list_page(response)

    def parse_home(self, response):
        """Follow article links whose title matches the target price series."""
        for item in response.xpath('//*[@id="__nuxt"]/div/div[5]/div/div[2]/div[1]/div[1]/div[2]/ul/li/a'):
            title = item.xpath('text()').get()
            # .get() returns None for anchors without text; guard before the
            # substring test to avoid a TypeError on None.
            if not title or '福建水泥市场参考价格' not in title:
                continue
            # Extract the href once instead of twice (print + Request).
            url = 'http://www.baiinfo.com{}'.format(item.xpath('@href').get())
            print(title, url)
            yield Request(
                method='GET',
                url=url,
                headers={
                    'User-Agent': self.user_agent,
                },
                cookies=self.cookie,
                meta={'source': title, 'type': 'list_page'}
            )

    def parse_list_page(self, response):
        """Yield one price record per table row of an article page.

        The article date (e.g. '2023年1月2日') is normalised to '2023-1-2'.
        Rows start after the two header rows (position()>2).
        """
        date = response.xpath('//*[@id="__nuxt"]/div/div[5]/div/div[2]/div[1]/div[2]/p[1]/span[4]/text()').get()
        date = date.strip('日').replace('年', '-').replace('月', '-')
        for item in response.xpath('//tr[position()>2]'):
            # First span-nesting level carries the price cell.
            block_1 = [cell.get() for cell in item.xpath('td/span/span/span/span/span/text()') if cell.get()]
            price, *_ = block_1
            # Deeper nesting carries spec/name/pack/source/(unused)/fluctuation.
            block_2 = [cell.get() for cell in item.xpath('td/span/span/span/span/span/span/text()') if cell.get()]
            spec, name, pack, source, _, fluctuating = block_2
            yield {
                'name': name,
                'price': price,
                'spec': spec,
                'pack': pack,
                'date': date,
                'source': source,
                'fluctuating': int(fluctuating)
            }


if __name__ == '__main__':
    import json

    from spiders import run_spider, BaiinfoCookieTools
    from commons.models.cement import Cement
    from core.factory import ClientApp

    # Read the authenticated session cookies.
    cookie = BaiinfoCookieTools.get_cookies()

    # Crawl: inject credentials as class attributes, then run the spider.
    CementSpider.cookie = cookie
    CementSpider.user_agent = BaiinfoCookieTools.user_agent
    file_path = run_spider(CementSpider)

    # Persist: load the scraped JSON (with a context manager so the file
    # handle is closed deterministically) and upsert each record.
    with open(file_path, 'r', encoding='utf-8') as fp:
        data = json.load(fp)
    with ClientApp().app_context():
        for item in data:
            print(item)
            Cement(**item).upsert()