Files
material-api/web/spiders/steel_rebar.py
2024-05-29 10:21:31 +08:00

91 lines
3.0 KiB
Python

import scrapy
from scrapy import Request
from price.constants.mysteel import PageType
class SteelRebarSpider(scrapy.Spider):
    """Scrape rebar price tables from Mysteel building-material market pages.

    The spider requests every list page in ``start_urls``, follows each
    article link whose text contains the Fuzhou market marker, and yields
    one dict per data row of the followed article's price table.
    """

    name = "steel_rebar"
    # (page type, URL) pairs. They are unpacked by start_requests(), which is
    # why the entries are tuples rather than plain URL strings.
    start_urls = [
        (PageType.REBAR_LIST, "https://jiancai.mysteel.com/market/pa228aa010101a0a01010502aaaa1.html"),
    ]
    # Both values are assigned on the class by the caller before the crawl.
    cookie = None
    user_agent = None

    def start_requests(self):
        """Yield the initial request for every entry of ``start_urls``.

        The page type is stored in ``meta`` under both ``source`` and
        ``type``; ``parse`` dispatches on ``type``.
        """
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={
                    'User-Agent': self.user_agent,
                },
                meta={'source': source, 'type': source},
            )

    def parse(self, response, **kwargs):
        """Route a response to the list or detail parser via ``meta['type']``.

        Responses with any other page type produce no output.
        """
        page_type = response.meta['type']
        if page_type == PageType.REBAR_LIST:
            yield from self.parse_steel_list(response)
        elif page_type == PageType.REBAR_DETAIL:
            yield from self.parse_steel_detail(response)

    def parse_steel_list(self, response):
        """Yield a detail-page request for every matching article link.

        Only anchors whose text contains the Fuzhou market marker are
        followed. The anchor text is passed on as ``meta['source']``.
        """
        for item in response.xpath('//*[@id="articleList"]/ul/li/a'):
            uri = item.xpath('@href').get()
            name = item.xpath('text()').get()
            self.logger.debug('list entry: %s %s', uri, name)
            # An anchor without an href or without a direct text node can
            # neither be matched nor followed; skip it instead of crashing.
            if not uri or not name:
                continue
            if ')福州市场建筑钢材价格行情' not in name:
                continue
            yield Request(
                method='GET',
                # urljoin leaves absolute URLs untouched and resolves
                # relative / scheme-less hrefs against the list page URL.
                url=response.urljoin(uri),
                headers={
                    'User-Agent': self.user_agent,
                },
                cookies=self.cookie,
                meta={'source': name, 'type': PageType.REBAR_DETAIL},
            )

    @staticmethod
    def parse_steel_detail(response):
        """Yield one price record per data row of a detail page.

        Each record is a dict with the keys ``name``, ``spec``, ``material``,
        ``source``, ``price``, ``fluctuating`` and ``date``.

        Raises:
            ValueError: if the page title is missing or carries no date, or
                if a price / fluctuation cell is not an integer.
        """
        # Parse the date from the page title.
        # NOTE(review): the separator literals were empty strings in the
        # reviewed source, which makes str.split() raise ValueError. They are
        # restored here as 年 / 月 / 日 on the assumption that titles start
        # with a date such as "2024年5月29日..." - confirm against a live page.
        title = response.xpath('//*[@id="content-title"]/text()').get()
        if not title or '日' not in title:
            raise ValueError(f'cannot parse date from page title: {title!r}')
        date = title.split('日')[0].replace('年', '-').replace('月', '-').strip()

        # The first two table rows are skipped (presumably header rows).
        for row in response.xpath('//*[@id="marketTable"]/tr[position()>2]'):
            # A cell without a direct text node yields None; treat it as empty.
            line = [(cell.xpath('text()').get() or '').strip() for cell in row.xpath('td')]
            if len(line) < 8:
                continue
            name, spec, material, source, price, fluctuating, *_ = line
            yield {
                'name': name,
                'spec': spec,
                'material': material,
                'source': source,
                'price': int(price),
                # A lone '-' in the fluctuation column is stored as 0.
                'fluctuating': 0 if fluctuating == '-' else int(fluctuating),
                'date': date,
            }
if __name__ == '__main__':
    import json

    from spiders import run_spider, MysteelCookieTools
    from commons.models.steel_rebar import SteelRebar
    from core.factory import ClientApp

    # Load the cookies.
    cookie = MysteelCookieTools.get_cookies()

    # Crawl: inject the cookies and user agent on the spider class, then run it.
    SteelRebarSpider.cookie = cookie
    SteelRebarSpider.user_agent = MysteelCookieTools.user_agent
    file_path = run_spider(SteelRebarSpider)

    # Persist: read the JSON file produced by the crawl and upsert each record.
    # The context manager closes the file handle, which the previous
    # open(...).read() call never did.
    with open(file_path, 'r', encoding='utf-8') as result_file:
        data = json.load(result_file)
    with ClientApp().app_context():
        for item in data:
            print(item)
            SteelRebar(**item).upsert()