import scrapy
from scrapy import Request

from commons.constants.mysteel import PageType


class SteelSectionSpider(scrapy.Spider):
    """Scrape section-steel (工角槽钢) market prices from mysteel.com.

    Flow: crawl the article list page, follow only the Fuzhou section-steel
    price article, then parse the price table on the detail page into dicts
    suitable for ``SteelSection(**item).upsert()``.

    ``cookie`` and ``user_agent`` are injected by the caller (see the
    ``__main__`` driver) before the spider is run.
    """

    name = "steel_section"
    # (page-type tag, url) pairs; the tag travels in request.meta so that
    # parse() can dispatch to the right handler.
    start_urls = [
        (PageType.SECTION_LIST, "https://list1.mysteel.com/market/p-227-----010107-0-01010502-------1.html"),
    ]
    cookie = None       # authenticated mysteel cookies, set externally
    user_agent = None   # UA string matching the cookie session, set externally

    def start_requests(self):
        """Issue one GET per configured list page, tagged with its page type."""
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={
                    'User-Agent': self.user_agent,
                },
                # Fix: send the authenticated cookies here too, consistent
                # with the detail request below — the list page belongs to
                # the same logged-in session.
                cookies=self.cookie,
                meta={'source': source, 'type': source},
            )

    def parse(self, response, **kwargs):
        """Dispatch a response to the handler matching its meta['type'] tag."""
        page_type = response.meta['type']
        if page_type == PageType.SECTION_LIST:
            yield from self.parse_section_list(response)
        elif page_type == PageType.SECTION_DETAIL:
            yield from self.parse_section_detail(response)

    def parse_section_list(self, response):
        """Walk the article list; follow only the Fuzhou section-steel article.

        Yields one detail-page Request per matching link.
        """
        for item in response.xpath('//*[@id="articleList"]/ul/li/a'):
            uri = item.xpath('@href').get()
            # Fix: .get() may return None (e.g. an <a> with no text node);
            # default to '' so the membership test below cannot raise TypeError.
            name = item.xpath('text()').get() or ''
            self.logger.debug('list entry: %s %s', uri, name)
            if '福州市场工角槽钢价格行情' not in name:
                continue
            yield Request(
                method='GET',
                url=uri,
                headers={
                    'User-Agent': self.user_agent,
                },
                cookies=self.cookie,
                meta={'source': name, 'type': PageType.SECTION_DETAIL},
            )

    def parse_section_detail(self, response):
        """Parse the detail page's price table into item dicts.

        The article title is assumed to start with a Chinese date such as
        '2023年5月10日…'; everything before '日' is converted to '2023-5-10'
        (note: month/day are NOT zero-padded — TODO confirm downstream
        ``SteelSection.date`` accepts that form).
        """
        title = response.xpath('//*[@id="content-title"]/text()').get()
        date = title.split('日')[0].replace('年', '-').replace('月', '-')
        # Skip the two header rows of the market table.
        for item in response.xpath('//*[@id="marketTable"]/tr[position()>2]'):
            # Fix: empty <td> cells make .get() return None — guard before
            # .strip() so a blank cell cannot raise AttributeError.
            line = [(cell.xpath('text()').get() or '').strip() for cell in item.xpath('td')]
            self.logger.debug('table row: %s', line)
            if len(line) < 7:
                # Too few columns: spacer/footnote row, not a price row.
                continue
            name, spec, material, source, price, fluctuating, *_ = line
            yield {
                'name': name,
                'spec': spec,
                'material': material,
                'source': source,
                'price': int(price),
                # '-' marks "no change" on the site; map it to 0.
                'fluctuating': 0 if fluctuating == '-' else int(fluctuating),
                'date': date,
            }


if __name__ == '__main__':
    import json

    from spiders import run_spider, MysteelCookieTools
    from commons.models.steel_section import SteelSection
    from core.factory import ClientApp

    # Load the authenticated session and inject it into the spider class.
    cookie = MysteelCookieTools.get_cookies()
    SteelSectionSpider.cookie = cookie
    SteelSectionSpider.user_agent = MysteelCookieTools.user_agent

    # Crawl; run_spider returns the path of the JSON feed it produced.
    file_path = run_spider(SteelSectionSpider)

    # Persist the scraped rows.
    # Fix: use a context manager so the feed file is closed deterministically.
    with open(file_path, 'r', encoding='utf-8') as feed:
        data = json.load(feed)
    with ClientApp().app_context():
        for item in data:
            print(item)
            SteelSection(**item).upsert()