init
96 web/spiders/cement.py Normal file
@@ -0,0 +1,96 @@
import scrapy
from scrapy import Request


class CementSpider(scrapy.Spider):
    name = "cement"
    # (source label, listing-page URL) pairs; '水泥' is Chinese for "cement".
    start_urls = [
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/1"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/2"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/3"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/4"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/5"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/6"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/7"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/8"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/9"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/10"),
    ]
    # Both are injected at runtime (see the __main__ block below).
    cookie = None
    user_agent = None

    def start_requests(self):
        # Issue a GET for each listing page, tagging every request with its
        # source label and a 'home' type so parse() can route the response.
        # Scrapy accepts `cookies` as a dict or a list of dicts.
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={'User-Agent': self.user_agent},
                cookies=self.cookie,
                meta={'source': source, 'type': 'home'},
            )

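    # No explicit callback is passed above, so Scrapy delivers every response
    # to parse(), which dispatches on the meta['type'] flag.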
    def parse(self, response, **kwargs):
        if response.meta['type'] == 'home':
            yield from self.parse_home(response)
        elif response.meta['type'] == 'list_page':
            yield from self.parse_list_page(response)

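    # Note: the absolute //*[@id="__nuxt"]/... XPaths below are tied to the
    # site's current Nuxt-rendered markup and will need updating if it changes.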
    def parse_home(self, response):
        # Scan the article list on the listing page and follow every post
        # whose title contains '福建水泥市场参考价格'
        # ("Fujian cement market reference price").
        for item in response.xpath('//*[@id="__nuxt"]/div/div[5]/div/div[2]/div[1]/div[1]/div[2]/ul/li/a'):
            title = item.xpath('text()').get()
            if title and '福建水泥市场参考价格' in title:
                url = 'http://www.baiinfo.com{}'.format(item.xpath('@href').get())
                self.logger.info('%s %s', title, url)
                yield Request(
                    method='GET',
                    url=url,
                    headers={'User-Agent': self.user_agent},
                    cookies=self.cookie,
                    meta={'source': title, 'type': 'list_page'},
                )

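    # Detail links repeated across the ten listing pages are dropped by
    # Scrapy's default duplicate-request filter, so each article is fetched
    # at most once.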
    def parse_list_page(self, response):
        # The publish date is rendered like '2023年1月5日' (year/month/day);
        # normalize it to '2023-1-5'.
        date = response.xpath('//*[@id="__nuxt"]/div/div[5]/div/div[2]/div[1]/div[2]/p[1]/span[4]/text()').get()
        date = date.strip('日').replace('年', '-').replace('月', '-')

        # Skip the first two (header) rows; each remaining row is one record.
        for item in response.xpath('//tr[position()>2]'):
            # The price text sits one <span> level shallower than the other
            # cells, so the two nesting depths are read separately.
            block_1 = [cell.get() for cell in item.xpath('td/span/span/span/span/span/text()') if cell.get()]
            price, *_ = block_1

            block_2 = [cell.get() for cell in item.xpath('td/span/span/span/span/span/span/text()') if cell.get()]
            spec, name, pack, source, _, fluctuating = block_2

            yield {
                'name': name,
                'price': price,
                'spec': spec,
                'pack': pack,
                'date': date,
                'source': source,
                # Assumes the change cell holds a bare integer.
                'fluctuating': int(fluctuating),
            }

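# The keys yielded by parse_list_page must match the Cement model's columns,
# since each item is upserted below via Cement(**item).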
if __name__ == '__main__':
    import json

    from spiders import run_spider, BaiinfoCookieTools
    from commons.models.cement import Cement
    from core.factory import ClientApp

    # Load the cookie.
    cookie = BaiinfoCookieTools.get_cookies()
    # Crawl: inject the cookie and User-Agent, then run the spider;
    # run_spider returns the path of the JSON file the items were written to.
    CementSpider.cookie = cookie
    CementSpider.user_agent = BaiinfoCookieTools.user_agent
    file_path = run_spider(CementSpider)
    # Persist to the database.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    with ClientApp().app_context():
        for item in data:
            print(item)
            Cement(**item).upsert()
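# For ad-hoc runs without the project's run_spider harness, a minimal
# untested sketch using Scrapy's own runner (the 'cement.json' feed path is
# illustrative) would be:
#
#     from scrapy.crawler import CrawlerProcess
#     process = CrawlerProcess(settings={'FEEDS': {'cement.json': {'format': 'json'}}})
#     process.crawl(CementSpider)
#     process.start()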