This commit is contained in:
han0
2024-07-09 18:11:22 +08:00
parent e6442b2cf9
commit d2eca25632
7 changed files with 126 additions and 2 deletions


@@ -0,0 +1,83 @@
import scrapy
from lxml import etree
from scrapy import Request


class DataJiangxiSpider(scrapy.Spider):
    name = "data_jiangxi"
    # (source label, listing URL) pairs; "材料价格" means "material prices".
    start_urls = [
        ('材料价格', "http://jt.jiangxi.gov.cn/col/col70716/index.html?uid=339408&pageNum=1"),
        ('材料价格', "http://jt.jiangxi.gov.cn/col/col70716/index.html?uid=339408&pageNum=2"),
        ('材料价格', "http://jt.jiangxi.gov.cn/col/col70716/index.html?uid=339408&pageNum=3"),
    ]

    def start_requests(self):
        # One request per listing page; carry the source label and page type in meta.
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
                },
                meta={'source': source, 'type': 'home'}
            )

    def parse(self, response, **kwargs):
        # Dispatch on the page type recorded in meta.
        if response.meta['type'] == 'home':
            yield from self.parse_home(response)
        elif response.meta['type'] == 'list':
            yield from self.parse_list(response)

    def parse_home(self, response):
        # The column listing is embedded in a <script> block as CDATA-wrapped XML;
        # strip the wrapper tags and parse what remains as an HTML fragment.
        rsp = response.xpath('//*[@id="339408"]/script/text()').get()
        for t in ('<![CDATA[', ']]>', '</record>', '<record>', '</recordset>', '<recordset>',
                  '</datastore>', '<datastore>', '</nextgroup>', '<nextgroup>'):
            rsp = rsp.replace(t, '')
        tree = etree.HTML(rsp)
        for item in tree.xpath('//li/a'):
            uri = item.xpath('@href')[0]
            name = item.xpath('text()')[0]
            # Only follow the engineering material price bulletins ("工程材料价格信息").
            if '工程材料价格信息' not in name:
                continue
            yield Request(
                method='GET',
                url=uri,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
                },
                meta={'source': name, 'type': 'list'}
            )

    def parse_list(self, response):
        # Publication date and source sit in the article header; each attachment
        # link in the article body points at one price document.
        date = response.xpath('//*[@id="content"]/div[1]/ul/li[3]/span/text()').get().split(' ')[0]
        source = response.xpath('//*[@id="content"]/div[1]/ul/li[1]/span/text()').get()
        for item in response.xpath('//*[@id="div_content"]/span/p/a'):
            uri = item.xpath('@href').get()
            name = item.xpath('text()').get()
            yield {
                'url': f'http://jt.jiangxi.gov.cn{uri}',
                'name': name,
                # 'source': response.meta['source']
                'source': source,
                'date': date,
            }


# TODO: persist the Jiangxi cost-station data to the database
if __name__ == '__main__':
    import json
    from spiders import run_spider
    from commons.models.data_zhejiang import DataZhejiang
    from core.factory import ClientApp

    # Crawl
    file_path = run_spider(DataJiangxiSpider)
    # Load into the database
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    with ClientApp().app_context():
        for item in data:
            print(item)
            # DataJiangxi(**item).upsert()