import scrapy
from scrapy import Request

USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
)


class DataGuangdongSpider(scrapy.Spider):
    name = "data_guangdong"

    # (source label, url) pairs covering list pages 1-5; the default
    # start_requests() is overridden below, so tuples are safe here despite
    # the attribute name.
    start_urls = [
        ('材料信息价', "http://zjz.gdcd.gov.cn/zjzgdcd/zjxx_clxxj/list.shtml"),
        ('材料信息价', "http://zjz.gdcd.gov.cn/zjzgdcd/zjxx_clxxj/list_2.shtml"),
        ('材料信息价', "http://zjz.gdcd.gov.cn/zjzgdcd/zjxx_clxxj/list_3.shtml"),
        ('材料信息价', "http://zjz.gdcd.gov.cn/zjzgdcd/zjxx_clxxj/list_4.shtml"),
        ('材料信息价', "http://zjz.gdcd.gov.cn/zjzgdcd/zjxx_clxxj/list_5.shtml"),
    ]

    def start_requests(self):
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={'User-Agent': USER_AGENT},
                meta={'source': source, 'type': 'home'},
            )

    def parse(self, response, **kwargs):
        # Dispatch on the request type recorded in meta.
        if response.meta['type'] == 'home':
            yield from self.parse_home(response)
        elif response.meta['type'] == 'list':
            yield from self.parse_list(response)

    def parse_home(self, response):
        # Walk the article list and follow only the material-price bulletins
        # (广东省交通建设工程主要外购材料信息价表).
        for item in response.xpath('/html/body/div/div[3]/div[2]/div[2]/div/ul/li/a'):
            uri = item.xpath('@href').get()
            name = item.xpath('text()').get()
            if not name or '广东省交通建设工程主要外购材料信息价表' not in name:
                continue
            yield Request(
                method='GET',
                url=f'http://zjz.gdcd.gov.cn{uri}',
                headers={'User-Agent': USER_AGENT},
                meta={'source': name, 'type': 'list'},
            )

    def parse_list(self, response):
        # str.strip() removes a *character set*, not a prefix, so use
        # removeprefix() to drop the literal "发布时间:" label instead.
        date = response.xpath(
            '/html/body/div[1]/div[4]/div/div[1]/div[2]/span[2]/text()'
        ).get('').removeprefix('发布时间:').strip()
        source = response.xpath(
            '/html/body/div[1]/div[4]/div/div[1]/div[2]/span[1]/b/text()'
        ).get()
        for item in response.xpath('//*[@id="zoomcon"]/p/a'):
            uri = item.xpath('@href').get()
            name = item.xpath('text()').get()
            self.logger.debug('%s %s', uri, name)
            yield {
                # response.urljoin() resolves the relative attachment path
                # against the page URL (handles './' and '/' prefixes too).
                'url': response.urljoin(uri),
                'name': name,
                # 'source': response.meta['source'],
                'source': source,
                'date': date,
            }


if __name__ == '__main__':
    import json

    from spiders import run_spider
    from commons.models.data_guangdong import DataGuangdong
    from core.factory import ClientApp

    # Crawl.
    file_path = run_spider(DataGuangdongSpider)

    # Load into the database.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    with ClientApp().app_context():
        for item in data:
            print(item)
            DataGuangdong(**item).upsert()
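
# Alternative standalone run without the project's run_spider helper — a
# minimal sketch using Scrapy's stock CrawlerProcess and feed exports
# ('items.json' is a hypothetical output path, not part of this project):
#
#     from scrapy.crawler import CrawlerProcess
#
#     process = CrawlerProcess(settings={
#         'FEEDS': {'items.json': {'format': 'json', 'encoding': 'utf-8'}},
#     })
#     process.crawl(DataGuangdongSpider)
#     process.start()  # blocks until the crawl finishes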