import scrapy
from lxml import html, etree
from scrapy import Request

# Single browser-like User-Agent shared by every request so the government
# site serves normal pages (was duplicated verbatim in two places).
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
)


class DataJiangxiSpider(scrapy.Spider):
    """Spider for material-price bulletins on the Jiangxi Dept. of Transport site.

    Crawls three pages of column 70716's index, follows every link whose
    title contains '工程材料价格信息' (engineering-material price info), and
    yields one item per attachment link found on each bulletin page.
    """

    name = "data_jiangxi"
    # (source label, listing-page URL) pairs. Non-standard shape for
    # start_urls, but start_requests() below unpacks it deliberately.
    start_urls = [
        ('材料价格', "http://jt.jiangxi.gov.cn/col/col70716/index.html?uid=339408&pageNum=1"),
        ('材料价格', "http://jt.jiangxi.gov.cn/col/col70716/index.html?uid=339408&pageNum=2"),
        ('材料价格', "http://jt.jiangxi.gov.cn/col/col70716/index.html?uid=339408&pageNum=3"),
    ]

    def start_requests(self):
        """Issue one GET per configured listing page, tagged type='home'."""
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={'User-Agent': USER_AGENT},
                meta={'source': source, 'type': 'home'},
            )

    def parse(self, response, **kwargs):
        """Dispatch to the matching handler based on meta['type']."""
        page_type = response.meta['type']
        if page_type == 'home':
            yield from self.parse_home(response)
        elif page_type == 'list':
            yield from self.parse_list(response)

    def parse_home(self, response):
        """Parse a column index page.

        The listing markup is embedded inside a <script> element, so it is
        extracted as text, cleaned, and re-parsed with lxml before the
        bulletin links can be followed.
        """
        rsp = response.xpath('//*[@id="339408"]/script/text()').get()
        # NOTE(review): every replacement target here is the empty string,
        # which makes this loop a no-op as written — the intended marker
        # strings were apparently lost. Kept byte-identical for fidelity;
        # TODO restore the original cleanup tokens.
        for t in ('', '', '', '', '', '', '', '', ''):
            rsp = rsp.replace(t, '')
        # Renamed from 'html' to avoid shadowing the lxml 'html' import.
        tree = etree.HTML(rsp)
        for item in tree.xpath('//li/a'):
            uri = item.xpath('@href')[0]
            name = item.xpath('text()')[0]
            self.logger.debug('home link: %s %s', uri, name)
            # Only follow engineering-material price bulletins.
            if '工程材料价格信息' not in name:
                continue
            yield Request(
                method='GET',
                url=uri,
                headers={'User-Agent': USER_AGENT},
                meta={'source': name, 'type': 'list'},
            )

    def parse_list(self, response):
        """Parse one bulletin page and yield an item per attachment link."""
        # Publication timestamp; keep only the date part before the space.
        date = response.xpath('//*[@id="content"]/div[1]/ul/li[3]/span/text()').get().split(' ')[0]
        source = response.xpath('//*[@id="content"]/div[1]/ul/li[1]/span/text()').get()
        for item in response.xpath('//*[@id="div_content"]/span/p/a'):
            uri = item.xpath('@href').get()
            name = item.xpath('text()').get()
            self.logger.debug('attachment: %s %s', uri, name)
            yield {
                # hrefs on this site are root-relative — prefix the host.
                'url': f'http://jt.jiangxi.gov.cn{uri}',
                'name': name,
                # 'source': response.meta['source']
                'source': source,
                'date': date,
            }


# TODO-2: ingest Jiangxi cost-station data into the database
if __name__ == '__main__':
    import json
    from spiders import run_spider
    from commons.models.data_zhejiang import DataZhejiang
    from core.factory import ClientApp

    # Crawl
    file_path = run_spider(DataJiangxiSpider)
    # Ingest: read the crawl output. Context manager fixes the original
    # unclosed-file leak (was json.loads(open(...).read())).
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    with ClientApp().app_context():
        for item in data:
            print(item)
            # DataJiangxi(**item).upsert()