"""Scrapy spider that collects rebar price tables from jiancai.mysteel.com."""
import logging

import scrapy
from scrapy import Request

from price.constants.mysteel import PageType

logger = logging.getLogger(__name__)

# Only list entries whose link text contains this marker are followed.
_DETAIL_TITLE_MARKER = ')福州市场建筑钢材价格行情'


class SteelRebarSpider(scrapy.Spider):
    """Crawl a list page, follow matching article links and yield price rows.

    ``cookie`` and ``user_agent`` are class attributes that the caller is
    expected to set before the crawl starts (the script entry point at the
    bottom of this file does so); both default to ``None``.
    """

    name = "steel_rebar"
    # (page type, URL) pairs rather than bare URLs; start_requests() unpacks them.
    start_urls = [
        (
            PageType.REBAR_LIST,
            "https://jiancai.mysteel.com/market/pa228aa010101a0a01010502aaaa1.html",
        ),
    ]
    cookie = None
    user_agent = None

    def start_requests(self):
        """Yield one GET request per ``(page type, URL)`` pair in ``start_urls``.

        The page type is stored in ``meta`` under both ``source`` and ``type``;
        ``parse`` dispatches on ``meta['type']``.
        """
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={'User-Agent': self.user_agent},
                meta={'source': source, 'type': source},
            )

    def parse(self, response, **kwargs):
        """Route the response to the list or detail parser by ``meta['type']``.

        Responses with any other type yield nothing.
        """
        page_type = response.meta['type']
        if page_type == PageType.REBAR_LIST:
            yield from self.parse_steel_list(response)
        elif page_type == PageType.REBAR_DETAIL:
            yield from self.parse_steel_detail(response)

    def parse_steel_list(self, response):
        """Yield a detail-page request for every matching link in the article list.

        A link matches when its text contains ``_DETAIL_TITLE_MARKER``. The link
        text is forwarded as ``meta['source']``.
        """
        for link in response.xpath('//*[@id="articleList"]/ul/li/a'):
            uri = link.xpath('@href').get()
            name = link.xpath('text()').get()
            self.logger.debug('list entry: %s %s', uri, name)
            # .get() returns None when the anchor has no href or no direct text
            # node; skip such entries instead of raising.
            if not uri or not name or _DETAIL_TITLE_MARKER not in name:
                continue
            yield Request(
                method='GET',
                # Resolve relative / protocol-relative hrefs against the page
                # URL; Request rejects URLs without a scheme.
                url=response.urljoin(uri),
                headers={'User-Agent': self.user_agent},
                cookies=self.cookie,
                meta={'source': name, 'type': PageType.REBAR_DETAIL},
            )

    @staticmethod
    def parse_steel_detail(response):
        """Yield one dict per price row of the ``marketTable`` on a detail page.

        Each dict has the keys ``name``, ``spec``, ``material``, ``source``,
        ``price`` (int), ``fluctuating`` (int, ``0`` when the cell is ``'-'``)
        and ``date``. Rows with fewer than 8 cells, or whose price / change
        cells are not integers, are skipped. Nothing is yielded when the page
        has no title.
        """
        # Parse the date: keep the text before the first '日' and turn '年' /
        # '月' into dashes. Presumably the title starts with a date such as
        # "2024年1月2日", giving "2024-1-2" (not zero-padded) - TODO confirm.
        title = response.xpath('//*[@id="content-title"]/text()').get()
        if title is None:
            logger.warning('no title found on %s; skipping page', response.url)
            return
        date = title.split('日')[0].replace('年', '-').replace('月', '-')

        # The first two rows of the table are skipped (presumably headers).
        for row in response.xpath('//*[@id="marketTable"]/tr[position()>2]'):
            # An empty <td> has no text node, so .get() returns None; treat it
            # as an empty string instead of crashing on .strip().
            line = [
                (cell.xpath('text()').get() or '').strip()
                for cell in row.xpath('td')
            ]
            logger.debug('row: %s', line)
            if len(line) < 8:
                continue
            name, spec, material, source, price, fluctuating, *_ = line
            try:
                price_value = int(price)
                change = 0 if fluctuating == '-' else int(fluctuating)
            except ValueError:
                # One malformed row must not abort the remaining rows.
                logger.warning('skipping row with non-numeric values: %s', line)
                continue
            yield {
                'name': name,
                'spec': spec,
                'material': material,
                'source': source,
                'price': price_value,
                'fluctuating': change,
                'date': date,
            }


if __name__ == '__main__':
    import json

    from spiders import run_spider, MysteelCookieTools
    from commons.models.steel_rebar import SteelRebar
    from core.factory import ClientApp

    # Load cookies
    cookie = MysteelCookieTools.get_cookies()

    # Crawl
    SteelRebarSpider.cookie = cookie
    SteelRebarSpider.user_agent = MysteelCookieTools.user_agent
    file_path = run_spider(SteelRebarSpider)

    # Persist to the database. The value returned by run_spider is opened as
    # a UTF-8 JSON file containing the scraped items; the context manager
    # closes the handle once parsed.
    with open(file_path, 'r', encoding='utf-8') as result_file:
        data = json.load(result_file)
    with ClientApp().app_context():
        for item in data:
            print(item)
            SteelRebar(**item).upsert()