import scrapy
from lxml import html, etree
from scrapy import Request

# Browser-like User-Agent shared by every request (government portals often
# reject the default Scrapy UA). Implicit concatenation — the runtime value
# is byte-identical to the original single literal.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
)


class DataZhejiangSpider(scrapy.Spider):
    """Spider for material-price bulletins on jtyst.zj.gov.cn (Zhejiang
    Provincial Department of Transport).

    Flow:
      1. ``start_requests`` fetches the first five pages of the column index
         (requests tagged ``meta['type'] == 'home'``).
      2. ``parse_home`` extracts per-issue links embedded in a ``<script>``
         element and follows only titles containing
         '《质监与造价》价格信息专辑' (tagged ``'list'``).
      3. ``parse_list`` yields one dict per attachment link on an issue page.
    """

    name = "data_zhejiang"

    # (source label, index-page URL) pairs. The custom start_requests below
    # consumes the tuples, so Scrapy's default start_urls handling (which
    # expects plain URL strings) is never used.
    start_urls = [
        ('材料价格', "http://jtyst.zj.gov.cn/col/col1228999576/index.html"),
        ('材料价格', "http://jtyst.zj.gov.cn/col/col1228999576/index.html?uid=5509220&pageNum=2"),
        ('材料价格', "http://jtyst.zj.gov.cn/col/col1228999576/index.html?uid=5509220&pageNum=3"),
        ('材料价格', "http://jtyst.zj.gov.cn/col/col1228999576/index.html?uid=5509220&pageNum=4"),
        ('材料价格', "http://jtyst.zj.gov.cn/col/col1228999576/index.html?uid=5509220&pageNum=5"),
    ]

    def start_requests(self):
        """Issue one GET per index page, tagging each request as 'home'."""
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={'User-Agent': _USER_AGENT},
                meta={'source': source, 'type': 'home'},
            )

    def parse(self, response, **kwargs):
        """Dispatch to the handler matching the request's ``meta['type']``."""
        if response.meta['type'] == 'home':
            yield from self.parse_home(response)
        elif response.meta['type'] == 'list':
            yield from self.parse_list(response)

    def parse_home(self, response):
        """Parse an index page.

        The link list is embedded inside a ``<script>`` element, so its text
        is re-parsed with lxml before the ``<li><a>`` anchors are extracted.
        """
        rsp = response.xpath('//*[@id="5509220"]/script/text()').get()
        # NOTE(review): the original stripped a tuple of markers from the
        # script text, but every marker survives here only as the empty
        # string — str.replace('', '') is a no-op, so the loop did nothing.
        # They were most likely CDATA wrappers such as '<![CDATA[' / ']]>';
        # TODO: confirm against a live response and restore the real tokens.
        tree = etree.HTML(rsp)  # renamed: the original shadowed lxml's `html` module
        for anchor in tree.xpath('//li/a'):
            uri = anchor.xpath('@href')[0]
            title = anchor.xpath('text()')[0]
            self.logger.debug('home link: %s -> %s', title, uri)
            # Only the price-information special issues are of interest.
            if '《质监与造价》价格信息专辑' not in title:
                continue
            yield Request(
                method='GET',
                url=f'http://jtyst.zj.gov.cn{uri}',
                headers={'User-Agent': _USER_AGENT},
                meta={'source': title, 'type': 'list'},
            )

    def parse_list(self, response):
        """Parse one issue page and yield a dict per attachment link."""
        # Publication date, e.g. '2023年5月10日 …' -> '2023-5-10'.
        # NOTE(review): .get() returns None if the xpath misses, which would
        # raise AttributeError below — the selectors are assumed stable.
        date = (
            response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/p/span[1]/text()')
            .get()
            .split('日')[0]
            .replace('年', '-')
            .replace('月', '-')
        )
        source = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/div[1]/p/span[2]/text()[2]'
        ).get()
        for anchor in response.xpath('//*[@id="zoom"]/p/a'):
            uri = anchor.xpath('@href').get()
            title = anchor.xpath('text()').get()
            self.logger.debug('list link: %s -> %s', title, uri)
            yield {
                # NOTE(review): index requests use http:// while this item
                # URL uses https:// — kept as-is; confirm the canonical scheme.
                'url': f'https://jtyst.zj.gov.cn{uri}',
                'name': title,
                'source': source,
                'date': date,
            }


if __name__ == '__main__':
    import json

    from spiders import run_spider
    from commons.models.data_zhejiang import DataZhejiang
    from core.factory import ClientApp

    # Crawl: run the spider and get the path of the JSON feed it produced.
    file_path = run_spider(DataZhejiangSpider)

    # Persist: load the feed and upsert each record. Using `with open(...)`
    # fixes the original's leaked (never-closed) file handle.
    with open(file_path, 'r', encoding='utf-8') as fh:
        data = json.load(fh)
    with ClientApp().app_context():
        for item in data:
            print(item)
            DataZhejiang(**item).upsert()