84 lines
		
	
	
		
			3.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
		
		
			
		
	
	
			84 lines
		
	
	
		
			3.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
|   | import scrapy | ||
|  | from lxml import html, etree | ||
|  | from scrapy import Request | ||
|  | 
 | ||
|  | 
 | ||
class DataJiangxiSpider(scrapy.Spider):
    """Spider for the material-price columns on jt.jiangxi.gov.cn.

    The crawl has two stages, distinguished by ``meta['type']``:

    * ``'home'``  - a column index page. The listing markup is embedded in a
      ``<script>`` under the element with id ``339408``; links whose title
      contains ``_TITLE_KEYWORD`` are followed.
    * ``'list'``  - an article page. Every attachment link found in the
      article body is emitted as an item with ``url``, ``name``, ``source``
      and ``date`` keys.
    """

    name = "data_jiangxi"
    # (source label, url) pairs; consumed by start_requests(), not by
    # Scrapy's default start_urls handling.
    start_urls = [
        ('材料价格', "http://jt.jiangxi.gov.cn/col/col70716/index.html?uid=339408&pageNum=1"),
        ('材料价格', "http://jt.jiangxi.gov.cn/col/col70716/index.html?uid=339408&pageNum=2"),
        ('材料价格', "http://jt.jiangxi.gov.cn/col/col70716/index.html?uid=339408&pageNum=3"),
    ]

    # Browser-like headers sent with every request issued by this spider.
    _HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
    }
    # Wrapper tokens stripped from the embedded script text so that only the
    # plain HTML fragments (the <li><a> entries) remain.
    _WRAPPER_TOKENS = (
        '<![CDATA[', ']]>',
        '</record>', '<record>',
        '</recordset>', '<recordset>',
        '</datastore>', '<datastore>',
        '</nextgroup>', '<nextgroup>',
    )
    # Only index entries whose title contains this text are followed.
    _TITLE_KEYWORD = '工程材料价格信息'

    def start_requests(self):
        """Yield one 'home' request per ``(source, url)`` pair in ``start_urls``."""
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers=self._HEADERS,
                meta={'source': source, 'type': 'home'},
            )

    def parse(self, response, **kwargs):
        """Dispatch the response to the handler named by ``meta['type']``."""
        page_type = response.meta.get('type')
        if page_type == 'home':
            yield from self.parse_home(response)
        elif page_type == 'list':
            yield from self.parse_list(response)
        else:
            # Previously ignored silently (or KeyError when the key was absent).
            self.logger.warning('unhandled page type %r for %s', page_type, response.url)

    def parse_home(self, response):
        """Extract article links from a column index page.

        Yields a 'list' request for every ``<li><a>`` entry whose title
        contains ``_TITLE_KEYWORD``. Entries without an href or without a
        direct text node are skipped instead of raising ``IndexError``.
        """
        payload = response.xpath('//*[@id="339408"]/script/text()').get()
        if not payload:
            # .get() returns None when the node is missing; nothing to parse.
            self.logger.warning('no embedded listing script found on %s', response.url)
            return

        for token in self._WRAPPER_TOKENS:
            payload = payload.replace(token, '')
        if not payload.strip():
            self.logger.warning('embedded listing on %s is empty', response.url)
            return

        # Named ``tree`` so the imported ``lxml.html`` module is not shadowed.
        tree = etree.HTML(payload)
        if tree is None:
            self.logger.warning('embedded listing on %s could not be parsed', response.url)
            return

        for anchor in tree.xpath('//li/a'):
            hrefs = anchor.xpath('@href')
            titles = anchor.xpath('text()')
            if not hrefs or not titles:
                continue
            uri, title = hrefs[0], titles[0]
            self.logger.debug('index entry: %s %s', uri, title)
            if self._TITLE_KEYWORD not in title:
                continue
            yield Request(
                method='GET',
                # Resolve against the page URL: Request rejects scheme-less URLs.
                url=response.urljoin(uri),
                headers=self._HEADERS,
                meta={'source': title, 'type': 'list'},
            )

    def parse_list(self, response):
        """Emit one item per attachment link found in an article page.

        ``date`` is the part of the third header field before the first
        space, or ``None`` when that field is missing. ``source`` is the
        first header field as shown on the page (not the link title carried
        in ``meta['source']``).
        """
        raw_date = response.xpath('//*[@id="content"]/div[1]/ul/li[3]/span/text()').get()
        date = raw_date.split(' ')[0] if raw_date else None
        source = response.xpath('//*[@id="content"]/div[1]/ul/li[1]/span/text()').get()

        for anchor in response.xpath('//*[@id="div_content"]/span/p/a'):
            uri = anchor.xpath('@href').get()
            if not uri:
                # An anchor without href has nothing to download.
                continue
            name = anchor.xpath('text()').get()
            self.logger.debug('attachment: %s %s', uri, name)
            yield {
                # urljoin keeps root-relative links on the site host and
                # leaves already-absolute links intact.
                'url': response.urljoin(uri),
                'name': name,
                'source': source,
                'date': date,
            }
|  | 
 | ||
# TODO: persist the Jiangxi cost-station data to the database.

if __name__ == '__main__':
    import json

    from spiders import run_spider
    from commons.models.data_zhejiang import DataZhejiang
    from core.factory import ClientApp

    # Crawl: run_spider returns the path of the file that is loaded below.
    file_path = run_spider(DataJiangxiSpider)

    # Load the crawl output; the context manager closes the file handle,
    # which the previous open(...).read() one-liner never did.
    with open(file_path, 'r', encoding='utf-8') as fh:
        data = json.load(fh)

    with ClientApp().app_context():
        for item in data:
            print(item)
            # Persistence is not wired up yet (see the TODO above); the
            # intended call is: DataJiangxi(**item).upsert()