92 lines
		
	
	
		
			3.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
		
		
			
		
	
	
			92 lines
		
	
	
		
			3.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
|   | import datetime | ||
|  | from dateutil.relativedelta import relativedelta | ||
|  | 
 | ||
|  | import scrapy | ||
|  | from scrapy import Request | ||
|  | 
 | ||
|  | 
 | ||
# (city id, city name) pairs. The id is substituted into the ``CityID2``
# query parameter of URL; the name is attached to every scraped item.
CITY_ID = [
    ('93', '福州市'),
    ('94', '厦门市'),
    ('95', '宁德市'),
    ('96', '莆田市'),
    ('97', '泉州市'),
    ('98', '漳州市'),
    ('99', '龙岩市'),
    ('100', '三明市'),
    ('101', '南平市'),
    ('102', '平潭综合实验区'),
]

# (class code, label) pairs. The code is substituted into the ``qClassCode``
# query parameter of URL; the label ends up in each item's 'category' field.
CLASS_CODE = [
    ('01', '01黑色及有色金属'),
    ('04', '04水泥、砖瓦灰砂'),
    ('05', '05木、竹材料及其制品'),
    ('13', '13涂料及防腐、防水材料'),
    ('14', '14油品、化工原料'),
]

# Listing-page URL template; fill in with
# ``URL.format(class_code=..., city_id=..., year=..., month=...)``.
URL = (
    'http://49.4.85.126/Information/Index'
    '?qClassCode={class_code}&qMatType=0'
    '&WayID=14&WayID2=4'
    '&CityID=7&CityID2={city_id}'
    '&Year={year}&Month={month}&Week=0&Day=0'
    '&qKeyWord='
)

# Number of most recent past months to crawl (counted back from last month).
MONTHS = 2
|  | 
 | ||
|  | 
 | ||
class DataFujianSpider(scrapy.Spider):
    """Scrape the price tables served by the listing pages built from ``URL``.

    ``start_requests`` issues one GET per (city, class code, month)
    combination; ``parse`` turns each table row of a response into a flat
    item dict.
    """

    name = "data_fujian"

    def start_requests(self):
        """Yield one request per city, class code and target month.

        The target months are the ``MONTHS`` calendar months preceding today
        (the current month itself is not requested). The city name, class
        label, year and month travel in ``meta`` so that ``parse`` can attach
        them to every item.
        """
        # Resolve "today" once so every request of this run is derived from
        # the same date, even if the crawl starts right before midnight.
        today = datetime.date.today()
        targets = [
            today - relativedelta(months=back)
            for back in range(1, 1 + MONTHS)
        ]
        user_agent = (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/116.0.0.0 Safari/537.36'
        )

        for city_id, city_name in CITY_ID:
            for class_code, source in CLASS_CODE:
                for target in targets:
                    yield Request(
                        method='GET',
                        url=URL.format(
                            year=target.year,
                            month=target.month,
                            class_code=class_code,
                            city_id=city_id,
                        ),
                        headers={'User-Agent': user_agent},
                        meta={
                            'source': source,
                            'type': 'home',
                            'city': city_name,
                            'month': target.month,
                            'year': target.year,
                        },
                    )

    @staticmethod
    def _stripped_texts(nodes):
        """Return the stripped first text node of each selector.

        A node without a direct text child contributes ``''`` instead of
        raising, because ``.get()`` returns ``None`` when nothing matches.
        """
        return [(node.xpath('text()').get() or '').strip() for node in nodes]

    def parse(self, response, **kwargs):
        """Yield one item dict per data row of the result table.

        Rows without ``<td>`` cells are skipped silently. Rows with fewer
        than six cells, or without a ``td/span`` element carrying the name,
        are skipped with a warning instead of aborting the whole page.

        Each item holds the row fields plus the ``category``, ``year``,
        ``month`` and ``city`` taken from ``response.meta`` and ``date``,
        the day the spider ran (``YYYY-MM-DD``).
        """
        crawl_date = datetime.date.today().strftime('%Y-%m-%d')
        meta = response.meta

        rows = response.xpath('//*[@id="searcList"]/div/div[2]/div[3]/div[3]/table/tr')
        for row in rows:
            cells = self._stripped_texts(row.xpath('td'))
            self.logger.debug('row cells: %s', cells)
            if not cells:
                # No <td> in this row (presumably a header row) - nothing to do.
                continue
            if len(cells) < 6:
                self.logger.warning(
                    'Skipping row with %d cells on %s: %s',
                    len(cells), response.url, cells,
                )
                continue
            # Second cell is ignored here; the name is read from its <span>.
            number, _, spec, unit, price, price_with_tax = cells[:6]

            names = self._stripped_texts(row.xpath('td/span'))
            self.logger.debug('row spans: %s', names)
            if not names:
                self.logger.warning(
                    'Skipping row without a name span on %s: %s',
                    response.url, cells,
                )
                continue

            yield {
                'number': number,
                'name': names[0],
                'spec': spec,
                'unit': unit,
                'price_without_tax': price,
                'price': price_with_tax,
                'category': meta['source'],
                'year': meta['year'],
                'month': meta['month'],
                'city': meta['city'],
                'date': crawl_date,
            }
|  | 
 | ||
|  | 
 | ||
if __name__ == '__main__':
    # Script entry point: run the spider, then load its output file and
    # upsert every scraped item through the DataFujian model.
    import json

    from spiders import run_spider
    from commons.models.data_fujian import DataFujian
    from core.factory import ClientApp

    # Crawl: run_spider returns the path of the file holding the results.
    file_path = run_spider(DataFujianSpider)

    # Store: read the JSON output, closing the file handle deterministically
    # instead of leaving it to the garbage collector.
    with open(file_path, 'r', encoding='utf-8') as result_file:
        data = json.load(result_file)

    with ClientApp().app_context():
        for item in data:
            print(item)
            DataFujian(**item).upsert()