init
This commit is contained in:
90
web/spiders/steel_rebar.py
Normal file
90
web/spiders/steel_rebar.py
Normal file
@@ -0,0 +1,90 @@
|
||||
import logging

import scrapy
from scrapy import Request

from price.constants.mysteel import PageType
|
||||
|
||||
|
||||
class SteelRebarSpider(scrapy.Spider):
    """Scrape rebar price tables from Mysteel market pages.

    The crawl starts from the list pages in ``start_urls``, follows the
    article links whose text contains the Fuzhou construction-steel marker,
    and yields one dict per usable row of each article's price table.
    """

    name = "steel_rebar"
    # (page type, url) pairs rather than bare URLs; consumed by start_requests().
    start_urls = [
        (PageType.REBAR_LIST, "https://jiancai.mysteel.com/market/pa228aa010101a0a01010502aaaa1.html"),
    ]
    # Set on the class by the script entry point before the crawl is started.
    cookie = None
    user_agent = None

    def start_requests(self):
        """Yield one GET request per entry in ``start_urls``.

        The page type is stored in ``meta`` under both ``source`` and ``type``;
        ``parse`` uses ``type`` to choose the parser.
        """
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={
                    'User-Agent': self.user_agent,
                },
                meta={'source': source, 'type': source}
            )

    def parse(self, response, **kwargs):
        """Dispatch the response to the list or detail parser via ``meta['type']``."""
        page_type = response.meta['type']
        if page_type == PageType.REBAR_LIST:
            yield from self.parse_steel_list(response)
        elif page_type == PageType.REBAR_DETAIL:
            yield from self.parse_steel_detail(response)

    def parse_steel_list(self, response):
        """Yield detail-page requests for the matching articles on a list page.

        Links without an href or without text are skipped, and hrefs are
        resolved against the page URL so that relative or protocol-relative
        links do not make ``Request`` fail on a missing scheme.
        """
        for link in response.xpath('//*[@id="articleList"]/ul/li/a'):
            uri = link.xpath('@href').get()
            name = link.xpath('text()').get()
            self.logger.debug('list entry: %s %s', uri, name)
            # A link lacking either attribute cannot be matched or followed.
            if not uri or not name:
                continue
            if ')福州市场建筑钢材价格行情' not in name:
                continue
            yield Request(
                method='GET',
                url=response.urljoin(uri),
                headers={
                    'User-Agent': self.user_agent,
                },
                cookies=self.cookie,
                meta={'source': name, 'type': PageType.REBAR_DETAIL}
            )

    @staticmethod
    def parse_steel_detail(response):
        """Yield one price record per data row of the article's market table.

        The date is taken from the page title: the text before the first
        '日', with '年' and '月' replaced by '-'. Rows with fewer than eight
        cells are ignored. Rows whose price or change cell is not an integer
        are skipped with a warning instead of aborting the whole page. A
        change cell of '-' is stored as 0.
        """
        log = logging.getLogger(__name__)

        # Parse the date out of the title.
        title = response.xpath('//*[@id="content-title"]/text()').get()
        if not title:
            log.warning('no title found on %s; page skipped', response.url)
            return
        date = title.strip().split('日')[0].replace('年', '-').replace('月', '-')

        # The first two table rows are skipped by the XPath position filter.
        for row in response.xpath('//*[@id="marketTable"]/tr[position()>2]'):
            # default='' keeps cells without a direct text node from raising.
            line = [cell.xpath('text()').get(default='').strip() for cell in row.xpath('td')]
            log.debug('table row: %s', line)
            if len(line) < 8:
                continue
            name, spec, material, source, price, fluctuating, *_ = line
            try:
                price_value = int(price)
                change = 0 if fluctuating == '-' else int(fluctuating)
            except ValueError:
                log.warning('non-numeric price/change in row %s on %s; row skipped', line, response.url)
                continue
            yield {
                'name': name,
                'spec': spec,
                'material': material,
                'source': source,
                'price': price_value,
                'fluctuating': change,
                'date': date
            }
|
||||
|
||||
|
||||
if __name__ == '__main__':
    import json

    from spiders import run_spider, MysteelCookieTools
    from commons.models.steel_rebar import SteelRebar
    from core.factory import ClientApp

    # Read the cookies.
    cookie = MysteelCookieTools.get_cookies()

    # Crawl: hand the cookies and user agent to the spider class, then run it.
    SteelRebarSpider.cookie = cookie
    SteelRebarSpider.user_agent = MysteelCookieTools.user_agent
    file_path = run_spider(SteelRebarSpider)

    # Store: load the crawl output and upsert every item.
    # The context manager closes the file handle, which the bare open().read() left open.
    with open(file_path, 'r', encoding='utf-8') as result_file:
        data = json.load(result_file)
    with ClientApp().app_context():
        for item in data:
            print(item)
            SteelRebar(**item).upsert()
|
Reference in New Issue
Block a user