init
web/spiders/data_fujian.py (new file, 91 lines)
@@ -0,0 +1,91 @@
import datetime
from dateutil.relativedelta import relativedelta

import scrapy
from scrapy import Request


# (CityID2 value, city name) pairs used to build the request URLs
CITY_ID = [
    ('93', '福州市'),    # Fuzhou
    ('94', '厦门市'),    # Xiamen
    ('95', '宁德市'),    # Ningde
    ('96', '莆田市'),    # Putian
    ('97', '泉州市'),    # Quanzhou
    ('98', '漳州市'),    # Zhangzhou
    ('99', '龙岩市'),    # Longyan
    ('100', '三明市'),   # Sanming
    ('101', '南平市'),   # Nanping
    ('102', '平潭综合实验区'),  # Pingtan Comprehensive Experimental Zone
]

# (qClassCode value, material category) pairs; the category also becomes the item's source label
CLASS_CODE = [
    ('01', '01黑色及有色金属'),        # ferrous and non-ferrous metals
    ('04', '04水泥、砖瓦灰砂'),        # cement, brick, tile, lime and sand
    ('05', '05木、竹材料及其制品'),    # timber and bamboo materials and their products
    ('13', '13涂料及防腐、防水材料'),  # coatings, anti-corrosion and waterproofing materials
    ('14', '14油品、化工原料'),        # oil products and chemical raw materials
]

URL = 'http://49.4.85.126/Information/Index?qClassCode={class_code}&qMatType=0&WayID=14&WayID2=4&CityID=7&CityID2={city_id}&Year={year}&Month={month}&Week=0&Day=0&qKeyWord='
MONTHS = 2  # number of most recent months to crawl
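
# Illustrative expansion (parameter values chosen for the example, not taken from the source):
# URL.format(class_code='01', city_id='93', year=2024, month=5) gives
#   http://49.4.85.126/Information/Index?qClassCode=01&qMatType=0&WayID=14&WayID2=4&CityID=7&CityID2=93&Year=2024&Month=5&Week=0&Day=0&qKeyWord=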


class DataFujianSpider(scrapy.Spider):
    name = "data_fujian"

    def start_requests(self):
        # one request per (city, material class, recent month) combination
        for city_id, city_name in CITY_ID:
            for class_code, source in CLASS_CODE:
                for month in range(1, 1 + MONTHS):
                    date = datetime.date.today() - relativedelta(months=month)
                    yield Request(
                        method='GET',
                        url=URL.format(year=date.year, month=date.month, class_code=class_code, city_id=city_id),
                        headers={
                            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
                        },
                        meta={'source': source, 'type': 'home', 'city': city_name, 'month': date.month, 'year': date.year}
                    )
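
    # Fan-out note: with the constants above, start_requests yields
    # 10 cities x 5 class codes x 2 months = 100 requests in total.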

    def parse(self, response, **kwargs):
        for item in response.xpath('//*[@id="searcList"]/div/div[2]/div[3]/div[3]/table/tr'):
            # plain-text cells of the row; guard against cells with no text node
            block_1 = [(i.xpath('text()').get() or '').strip() for i in item.xpath('td')]
            self.logger.debug(block_1)
            if len(block_1) < 6:  # skip header or malformed rows
                continue
            number, _, type_, unit, price, price_with_tax, *_ = block_1

            # the material name sits in a <span> inside its cell
            block_2 = [(i.xpath('text()').get() or '').strip() for i in item.xpath('td/span')]
            self.logger.debug(block_2)
            if not block_2:
                continue
            name, *_ = block_2

            yield {
                'number': number,
                'name': name,
                'spec': type_,
                'unit': unit,
                'price_without_tax': price,
                'price': price_with_tax,
                'category': response.meta['source'],
                'year': response.meta['year'],
                'month': response.meta['month'],
                'city': response.meta['city'],
                'date': datetime.date.today().strftime('%Y-%m-%d')
            }
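
    # Shape of one yielded item, sketched with placeholder values (not real data):
    #   {'number': '1', 'name': '螺纹钢', 'spec': 'HRB400', 'unit': '吨',
    #    'price_without_tax': '3500.00', 'price': '3955.00',
    #    'category': '01黑色及有色金属', 'year': 2024, 'month': 5,
    #    'city': '福州市', 'date': '2024-06-15'}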


if __name__ == '__main__':
    import json

    from spiders import run_spider
    from commons.models.data_fujian import DataFujian
    from core.factory import ClientApp

    # crawl
    file_path = run_spider(DataFujianSpider)
    # store in the database
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    with ClientApp().app_context():
        for item in data:
            print(item)
            DataFujian(**item).upsert()
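
# A minimal standalone alternative to the project-specific run_spider helper,
# using only Scrapy's public API (a sketch; the DB upsert step above is omitted):
#
#   from scrapy.crawler import CrawlerProcess
#
#   process = CrawlerProcess(settings={
#       'FEEDS': {'data_fujian.json': {'format': 'json', 'encoding': 'utf8'}},
#   })
#   process.crawl(DataFujianSpider)
#   process.start()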