init
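Adds a Scrapy spider that crawls 材料信息价 (material price) notices from the Guangdong Provincial Transport Department cost-station site (zjz.gdcd.gov.cn) and, when run as a script, persists the scraped items through the project's DataGuangdong model.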
web/spiders/data_guangdong.py (new file, 78 lines)

@@ -0,0 +1,78 @@
import scrapy
from scrapy import Request

# Shared request headers: a desktop Chrome User-Agent so the site serves the
# regular browser-facing pages.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
}


class DataGuangdongSpider(scrapy.Spider):
    name = "data_guangdong"

    # (source_label, url) pairs consumed by the overridden start_requests below;
    # this deliberately repurposes Scrapy's start_urls attribute.
    start_urls = [
        ('材料信息价', "http://zjz.gdcd.gov.cn/zjzgdcd/zjxx_clxxj/list.shtml"),
        ('材料信息价', "http://zjz.gdcd.gov.cn/zjzgdcd/zjxx_clxxj/list_2.shtml"),
        ('材料信息价', "http://zjz.gdcd.gov.cn/zjzgdcd/zjxx_clxxj/list_3.shtml"),
        ('材料信息价', "http://zjz.gdcd.gov.cn/zjzgdcd/zjxx_clxxj/list_4.shtml"),
        ('材料信息价', "http://zjz.gdcd.gov.cn/zjzgdcd/zjxx_clxxj/list_5.shtml"),
    ]
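    # Note: only index pages list.shtml through list_5.shtml are covered; the
    # spider does not follow pagination links, so if the index grows past five
    # pages this list has to be extended by hand.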

    def start_requests(self):
        # Fetch each index page; the source label and a page-type flag travel
        # along in the request meta.
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers=HEADERS,
                meta={'source': source, 'type': 'home'},
            )

    def parse(self, response, **kwargs):
        # Route responses by the 'type' flag set when the request was created.
        if response.meta['type'] == 'home':
            yield from self.parse_home(response)
        elif response.meta['type'] == 'list':
            yield from self.parse_list(response)

    def parse_home(self, response):
        # The index page lists announcements; keep only the material price
        # tables ('广东省交通建设工程主要外购材料信息价表') and request each detail page.
        for item in response.xpath('/html/body/div/div[3]/div[2]/div[2]/div/ul/li/a'):
            uri = item.xpath('@href').get()
            name = item.xpath('text()').get()
            if '广东省交通建设工程主要外购材料信息价表' not in name:
                continue
            yield Request(
                method='GET',
                url=f'http://zjz.gdcd.gov.cn{uri}',  # index hrefs lack the host, so prefix it
                headers=HEADERS,
                meta={'source': name, 'type': 'list'},
            )

    def parse_list(self, response):
        # removeprefix (Python 3.9+) drops the literal '发布时间:' label; the
        # original str.strip call treated its argument as a character set and
        # could also eat matching characters from the end of the date.
        date = response.xpath('/html/body/div[1]/div[4]/div/div[1]/div[2]/span[2]/text()').get().removeprefix('发布时间:')
        source = response.xpath('/html/body/div[1]/div[4]/div/div[1]/div[2]/span[1]/b/text()').get()
        for item in response.xpath('//*[@id="zoomcon"]/p/a'):
            uri = item.xpath('@href').get()
            name = item.xpath('text()').get()
            # Attachment links are relative to the detail page's directory.
            url_prefix = '/'.join(response.url.split('/')[:-1])
            self.logger.debug('%s %s', uri, name)  # was a bare print
            yield {
                'url': f'{url_prefix}/{uri}',
                'name': name,
                # 'source': response.meta['source'],
                'source': source,
                'date': date,
            }


if __name__ == '__main__':
    import json

    from spiders import run_spider
    from commons.models.data_guangdong import DataGuangdong
    from core.factory import ClientApp

    # Crawl: run the spider and collect its items into a JSON file.
    file_path = run_spider(DataGuangdongSpider)
    # Persist: load the scraped items and upsert each one into the database.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    with ClientApp().app_context():
        for item in data:
            print(item)
            DataGuangdong(**item).upsert()
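
For a quick local run without the project-specific helpers (run_spider, ClientApp, DataGuangdong), a minimal sketch using Scrapy's stock CrawlerProcess with a JSON feed export could look like the following; the output file name data_guangdong.json is an assumption, not part of the project:

# Sketch only: standalone run via Scrapy's CrawlerProcess.
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    # FEEDS is Scrapy's built-in feed-export setting; the file name is arbitrary.
    'FEEDS': {'data_guangdong.json': {'format': 'json', 'encoding': 'utf8'}},
})
process.crawl(DataGuangdongSpider)
process.start()  # blocks until the crawl finishes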