init
web/spiders/oil.py | 83 (new file)
@@ -0,0 +1,83 @@
import json

import scrapy
from scrapy import Request


class OilSpider(scrapy.Spider):
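    """Spider for refined-oil price adjustment ("成品油价格调整") notices on fgw.fujian.gov.cn.

    Each start_urls entry is a (source label, search-API URL) pair rather than a bare URL,
    so start_requests is overridden to unpack it.
    """
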
    name = "oil"
    start_urls = [
        ('成品油价格调整', "https://fgw.fujian.gov.cn/was5/web/search?channelid=217025&templet=advsch.jsp&sortfield=-docreltime&classsql=%25%E6%88%90%E5%93%81%E6%B2%B9%E4%BB%B7%E6%A0%BC%E8%B0%83%E6%95%B4%25*siteid%3D31*siteid%3D31&prepage=100&page=1"),
    ]

    def start_requests(self):
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
                },
                meta={'source': source, 'type': 'list'}
            )

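    # parse() routes responses by the 'type' flag attached in request meta:
    # the search listing goes to parse_list, individual notice pages to parse_detail.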
    def parse(self, response, **kwargs):
        if response.meta['type'] == 'list':
            yield from self.parse_list(response)
        elif response.meta['type'] == 'detail':
            yield from self.parse_detail(response)

    def parse_list(self, response):
        # Strip newlines before parsing the JSON returned by the search endpoint.
        ret = json.loads(response.text.replace('\n', ''))
        if ret.get('count') and ret.get('docs'):
            for item in ret['docs']:
                if not item.get('title2'):
                    continue
                self.logger.info(f"{item['title2']} {item['pubtime']}")
                yield Request(
                    method='GET',
                    url=item['url'],
                    meta={'source': f"{item['title2']} {item['pubtime']}", 'type': 'detail', 'time': item['pubtime']}
                )

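    # The first table on a notice page comes in two layouts: product names are either
    # wrapped in <span> cells or given as plain <td> text, so both branches below are needed.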
    def parse_detail(self, response):
        for item in response.xpath('//table[1]/tbody/tr'):
            if item.xpath('td/span/text()').getall():
                # Layout 1: the product name sits in a <span>, prices in the plain <td> text.
                first_word = item.xpath('td/span/text()').get()
                # Skip header rows ('油品' = product, '元/吨' = yuan per tonne).
                if first_word.strip() in ('油品', '元/吨'):
                    continue
                name = first_word
                price, *_ = item.xpath('td/text()').getall()
                yield {
                    'name': name,
                    'price': int(price),
                    'date': response.meta['time'].split(' ')[0],
                }
            elif item.xpath('td/text()').getall():
                # Layout 2: both the product name and the price are plain <td> text.
                first_word = item.xpath('td/text()').get()
                if first_word.strip() in ('油品', '元/吨') or first_word.startswith('\xa0') or first_word.startswith('\n'):
                    continue
                name, price, *_ = item.xpath('td/text()').getall()
                yield {
                    'name': name,
                    'price': price,
                    'date': response.meta['time'].split(' ')[0],
                }


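# Ad-hoc runner: run_spider is expected to return the path of the JSON feed produced by
# the crawl; each scraped item is then upserted through the project's Oil model.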
if __name__ == '__main__':
    from spiders import run_spider
    from commons.models.oil import Oil
    from core.factory import ClientApp

    # Crawl
    file_path = run_spider(OilSpider)
    # Insert into the database
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.loads(f.read())
    with ClientApp().app_context():
        for item in data:
            print(item)
            Oil(**item).upsert()