"""Scrapy spider that collects monthly material price rows per city and category.

Running this file as a script crawls with ``DataFujianSpider``, loads the
JSON file produced by ``run_spider`` and upserts every item as a
``DataFujian`` record.
"""
import datetime

from dateutil.relativedelta import relativedelta
import scrapy
from scrapy import Request

# (value for the ``CityID2`` query parameter, city name) pairs.
CITY_ID = [
    ('93', '福州市'),
    ('94', '厦门市'),
    ('95', '宁德市'),
    ('96', '莆田市'),
    ('97', '泉州市'),
    ('98', '漳州市'),
    ('99', '龙岩市'),
    ('100', '三明市'),
    ('101', '南平市'),
    ('102', '平潭综合实验区'),
]

# (value for the ``qClassCode`` query parameter, category label) pairs.
CLASS_CODE = [
    ('01', '01黑色及有色金属'),
    ('04', '04水泥、砖瓦灰砂'),
    ('05', '05木、竹材料及其制品'),
    ('13', '13涂料及防腐、防水材料'),
    ('14', '14油品、化工原料'),
]

# Listing page template; formatted with class_code, city_id, year and month.
URL = 'http://49.4.85.126/Information/Index?qClassCode={class_code}&qMatType=0&WayID=14&WayID2=4&CityID=7&CityID2={city_id}&Year={year}&Month={month}&Week=0&Day=0&qKeyWord='

MONTHS = 2  # Number of most recent months to crawl.

# Browser-like User-Agent sent with every listing request.
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'

# Minimum number of <td> cells a data row needs:
# number, (name cell), spec, unit, price without tax, price with tax.
_MIN_CELLS = 6


class DataFujianSpider(scrapy.Spider):
    """Crawl the price listing for every city/category over the last ``MONTHS`` months."""

    name = "data_fujian"

    def start_requests(self):
        """Yield one GET request per (city, category, month) combination.

        The months covered are the ``MONTHS`` months preceding the current
        one. City name, category label, year and month travel in
        ``Request.meta`` so that ``parse`` can attach them to each item.
        """
        # Resolve "today" once so every request of a run targets the same
        # months, even if the run crosses midnight at a month boundary.
        today = datetime.date.today()
        periods = [
            today - relativedelta(months=offset)
            for offset in range(1, 1 + MONTHS)
        ]
        for city_id, city_name in CITY_ID:
            for class_code, source in CLASS_CODE:
                for date in periods:
                    yield Request(
                        method='GET',
                        url=URL.format(
                            year=date.year,
                            month=date.month,
                            class_code=class_code,
                            city_id=city_id,
                        ),
                        headers={'User-Agent': _USER_AGENT},
                        meta={
                            'source': source,
                            'type': 'home',
                            'city': city_name,
                            'month': date.month,
                            'year': date.year,
                        },
                    )

    @staticmethod
    def _cell_texts(nodes):
        """Return the stripped first text node of each selector ('' if it has none)."""
        return [node.xpath('text()').get(default='').strip() for node in nodes]

    def parse(self, response, **kwargs):
        """Yield one item dict per data row of the listing table.

        Rows without any ``<td>`` are skipped silently; rows with fewer than
        ``_MIN_CELLS`` cells or without a ``td/span`` name element are skipped
        with a warning instead of aborting the whole page.
        """
        rows = response.xpath(
            '//*[@id="searcList"]/div/div[2]/div[3]/div[3]/table/tr'
        )
        meta = response.meta
        for row in rows:
            cells = self._cell_texts(row.xpath('td'))
            self.logger.debug('row cells: %r', cells)
            if not cells:
                # No <td> at all: nothing to extract (presumably a header row).
                continue

            # The item name is read from a <span> nested inside a cell.
            names = self._cell_texts(row.xpath('td/span'))
            self.logger.debug('row spans: %r', names)
            if len(cells) < _MIN_CELLS or not names:
                self.logger.warning(
                    'Skipping malformed row on %s: cells=%r spans=%r',
                    response.url, cells, names,
                )
                continue

            # The second cell is ignored here; the name comes from its <span>.
            number, _, type_, unit, price, price_with_tax = cells[:_MIN_CELLS]
            yield {
                'number': number,
                'name': names[0],
                'spec': type_,
                'unit': unit,
                'price_without_tax': price,
                'price': price_with_tax,
                'category': meta['source'],
                'year': meta['year'],
                'month': meta['month'],
                'city': meta['city'],
                # First day of the crawled month, formatted as YYYY-MM-DD.
                'date': datetime.date(meta['year'], meta['month'], 1).isoformat(),
            }


if __name__ == '__main__':
    import json

    from spiders import run_spider
    from commons.models.data_fujian import DataFujian
    from core.factory import ClientApp

    # Crawl; run_spider returns the path of the JSON output file.
    file_path = run_spider(DataFujianSpider)

    # Store the crawled items.
    with open(file_path, 'r', encoding='utf-8') as fp:
        data = json.load(fp)
    with ClientApp().app_context():
        for item in data:
            print(item)
            DataFujian(**item).upsert()