import scrapy
from scrapy import Request

from commons.constants.mysteel import PageType


class SteelPlateSpider(scrapy.Spider):
    """Scrape steel plate price tables from mysteel.com.

    The crawl has two stages, distinguished by ``response.meta['type']``:

    1. ``PageType.PLATE_LIST``: an article list page. Only links whose text
       contains ``福州市场中厚板价格行情`` are followed.
    2. ``PageType.PLATE_DETAIL``: an article page whose ``#marketTable`` rows
       are turned into one item dict per row.

    ``cookie`` and ``user_agent`` are class attributes that the caller is
    expected to set before the crawl starts (the ``__main__`` section of this
    module does so).
    """

    name = "steel_plate"
    # NOTE: entries are (page_type, url) tuples rather than plain URLs, which is
    # why ``start_requests`` is overridden.
    start_urls = [
        (PageType.PLATE_LIST, "https://list1.mysteel.com/market/p-219-----010102-0-01010502-------1.html"),
    ]
    cookie = None
    user_agent = None

    # User-Agent sent with detail-page requests.
    # NOTE(review): this differs from ``user_agent`` used for the list page even
    # though the injected cookies are sent with it -- confirm that is intended.
    _DETAIL_USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    )

    def start_requests(self):
        """Yield one GET request per ``(page_type, url)`` pair in ``start_urls``."""
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={'User-Agent': self.user_agent},
                meta={'source': source, 'type': source},
            )

    def parse(self, response, **kwargs):
        """Dispatch a response to the list or detail parser by ``meta['type']``."""
        page_type = response.meta['type']
        if page_type == PageType.PLATE_LIST:
            yield from self.parse_board_list(response)
        elif page_type == PageType.PLATE_DETAIL:
            yield from self.parse_board_detail(response)

    def parse_board_list(self, response):
        """Yield a detail-page request for every matching article link.

        Links without an ``href`` or without direct text are skipped instead of
        raising, and hrefs are resolved against the page URL so that relative or
        scheme-less links produce a valid request.
        """
        for link in response.xpath('//*[@id="articleList"]/ul/li/a'):
            uri = link.xpath('@href').get()
            name = link.xpath('text()').get()
            self.logger.debug('list entry: %s %s', uri, name)
            if not uri or not name or '福州市场中厚板价格行情' not in name:
                continue
            yield Request(
                method='GET',
                url=response.urljoin(uri),
                headers={'User-Agent': self._DETAIL_USER_AGENT},
                cookies=self.cookie,
                meta={'source': name, 'type': PageType.PLATE_DETAIL},
            )

    def parse_board_detail(self, response):
        """Yield one price item per data row of the ``#marketTable`` table.

        The date is derived from the page title: the text before the first
        ``日`` with ``年`` and ``月`` replaced by ``-``. Rows with fewer than
        seven cells are skipped; rows whose price or fluctuation cannot be
        parsed as an integer are logged and skipped so that one bad row does
        not discard the rest of the page.
        """
        title = response.xpath('//*[@id="content-title"]/text()').get()
        if not title:
            self.logger.warning('no title found on %s; skipping page', response.url)
            return
        date = title.strip().split('日')[0].replace('年', '-').replace('月', '-')

        # position()>2 skips the first two <tr> elements of the table.
        for row in response.xpath('//*[@id="marketTable"]/tr[position()>2]'):
            # default='' keeps an empty <td> from crashing on None.strip().
            line = [cell.xpath('text()').get(default='').strip() for cell in row.xpath('td')]
            self.logger.debug('row: %s', line)
            if len(line) < 7:
                continue
            name, spec, material, source, price, fluctuating, *_ = line
            try:
                price_value = int(price)
                # '-' marks an unchanged price.
                fluctuating_value = 0 if fluctuating == '-' else int(fluctuating)
            except ValueError:
                self.logger.warning('skipping unparsable row on %s: %s', response.url, line)
                continue
            yield {
                'name': name,
                'spec': spec,
                'material': material,
                'source': source,
                'price': price_value,
                'fluctuating': fluctuating_value,
                'date': date,
            }


if __name__ == '__main__':
    import json

    from spiders import run_spider, MysteelCookieTools
    from commons.models.steel_plate import SteelPlate
    from core.factory import ClientApp

    # Load cookies
    cookie = MysteelCookieTools.get_cookies()

    # Crawl
    SteelPlateSpider.cookie = cookie
    SteelPlateSpider.user_agent = MysteelCookieTools.user_agent
    file_path = run_spider(SteelPlateSpider)

    # Persist to the database; the context manager closes the file promptly.
    with open(file_path, 'r', encoding='utf-8') as fh:
        data = json.load(fh)
    with ClientApp().app_context():
        for item in data:
            print(item)
            SteelPlate(**item).upsert()