85 lines
3.4 KiB
Python
85 lines
3.4 KiB
Python
import scrapy
|
|
from lxml import html, etree
|
|
from scrapy import Request
|
|
|
|
|
|
class DataZhejiangSpider(scrapy.Spider):
    """Spider for material-price bulletins on the Zhejiang Provincial DOT site.

    Flow:
      1. ``start_requests`` fetches the paginated column listing pages.
      2. ``parse_home`` extracts record links embedded in a CDATA <script>
         blob and follows only the "price information special issue" pages.
      3. ``parse_list`` yields one item per attachment link on a detail page.
    """

    name = "data_zhejiang"

    # Shared request headers: a desktop Chrome UA avoids trivial bot blocking.
    # Hoisted to a class constant so both request sites stay in sync.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
    }

    # (source label, url) pairs. Page 1 has no query string; pages 2-5 carry
    # the column uid and page number. Generated instead of hand-duplicated.
    start_urls = [
        ('材料价格', "http://jtyst.zj.gov.cn/col/col1228999576/index.html"),
    ] + [
        ('材料价格', f"http://jtyst.zj.gov.cn/col/col1228999576/index.html?uid=5509220&pageNum={page}")
        for page in range(2, 6)
    ]

    def start_requests(self):
        """Issue a GET for every configured listing page.

        The label travels in ``meta['source']``; ``meta['type']`` routes the
        response through :meth:`parse`.
        """
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers=self.HEADERS,
                meta={'source': source, 'type': 'home'},
            )

    def parse(self, response, **kwargs):
        """Dispatch the response to the handler named in ``meta['type']``."""
        page_type = response.meta['type']
        if page_type == 'home':
            yield from self.parse_home(response)
        elif page_type == 'list':
            yield from self.parse_list(response)

    def parse_home(self, response):
        """Parse a column listing page and follow price-bulletin links.

        The record list is delivered inside a <script> element as a CDATA
        blob of pseudo-XML (<record>/<recordset>/...); the wrapper tags are
        stripped so the remaining HTML fragment can be parsed with lxml.
        """
        rsp = response.xpath('//*[@id="5509220"]/script/text()').get()
        if rsp is None:
            # Guard: a layout change (or block page) would otherwise raise
            # AttributeError on the .replace() chain below.
            self.logger.warning('record script not found on %s', response.url)
            return
        for tag in ('<![CDATA[', ']]>', '</record>', '<record>',
                    '</recordset>', '<recordset>', '</datastore>',
                    '<datastore>', '</nextgroup>', '<nextgroup>'):
            rsp = rsp.replace(tag, '')
        # Named `doc`, not `html`, to avoid shadowing the lxml.html import.
        doc = etree.HTML(rsp)
        for item in doc.xpath('//li/a'):
            uri = item.xpath('@href')[0]
            name = item.xpath('text()')[0]
            self.logger.debug('home link: %s %s', uri, name)
            # Only the monthly "quality supervision & cost" price special
            # issues are of interest; skip everything else.
            if '《质监与造价》价格信息专辑' not in name:
                continue
            yield Request(
                method='GET',
                url=f'http://jtyst.zj.gov.cn{uri}',
                headers=self.HEADERS,
                meta={'source': name, 'type': 'list'},
            )

    def parse_list(self, response):
        """Parse a bulletin detail page; yield one dict per attachment link.

        Each item carries the absolute attachment URL, its link text, the
        issuing source, and the publication date normalized to YYYY-MM-DD.
        """
        # e.g. '2023年10月12日…' -> '2023-10-12'
        date = (
            response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/p/span[1]/text()')
            .get()
            .split('日')[0]
            .replace('年', '-')
            .replace('月', '-')
        )
        source = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/p/span[2]/text()[2]').get()
        for item in response.xpath('//*[@id="zoom"]/p/a'):
            uri = item.xpath('@href').get()
            name = item.xpath('text()').get()
            self.logger.debug('attachment: %s %s', uri, name)
            yield {
                'url': f'https://jtyst.zj.gov.cn{uri}',
                'name': name,
                'source': source,
                'date': date,
            }
|
|
|
|
|
|
if __name__ == '__main__':
    import json

    from spiders import run_spider
    from commons.models.data_zhejiang import DataZhejiang
    from core.factory import ClientApp

    # Crawl: run the spider; results are written to a JSON file.
    file_path = run_spider(DataZhejiangSpider)

    # Persist: load the scraped items and upsert each one inside an
    # application context. `with open(...)` guarantees the file handle is
    # closed (the original leaked it), and json.load reads the stream
    # directly instead of json.loads(f.read()).
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    with ClientApp().app_context():
        for item in data:
            print(item)
            DataZhejiang(**item).upsert()
|