# Files: material-api/web/spiders/steel_plate.py
# 2024-05-29 10:21:31 +08:00
# 90 lines, 3.0 KiB, Python

import re

import scrapy
from scrapy import Request

from commons.constants.mysteel import PageType
class SteelPlateSpider(scrapy.Spider):
    """Spider that collects steel plate price rows from Mysteel market pages.

    The crawl has two stages, distinguished by ``response.meta['type']``:

    1. ``PageType.PLATE_LIST`` -- an article list page; links whose text
       contains ``_TITLE_KEYWORD`` are followed.
    2. ``PageType.PLATE_DETAIL`` -- an article page; every data row of the
       ``marketTable`` table is emitted as a dict.

    ``cookie`` and ``user_agent`` are class attributes that the caller is
    expected to set before the crawl starts.
    """

    name = "steel_plate"
    # Unlike stock Scrapy, each entry is a ``(page_type, url)`` pair; the
    # overridden ``start_requests`` unpacks it.
    start_urls = [
        (PageType.PLATE_LIST, "https://list1.mysteel.com/market/p-219-----010102-0-01010502-------1.html"),
    ]
    # Session cookies sent with detail-page requests (``None`` sends none).
    cookie = None
    # User-Agent header value; should match the one the cookies were issued to.
    user_agent = None

    # Only list entries whose link text contains this phrase are followed
    # (literally: "Fuzhou market medium/heavy plate price quotes").
    _TITLE_KEYWORD = '福州市场中厚板价格行情'
    # Used for detail requests when no ``user_agent`` has been injected.
    _DEFAULT_USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    )
    # Year, month and day as three digit groups separated by any non-digit
    # characters, so the pattern does not depend on the exact separators.
    _TITLE_DATE_RE = re.compile(r'(\d{4})\D+(\d{1,2})\D+(\d{1,2})')

    def start_requests(self):
        """Yield one GET request per ``(page_type, url)`` entry in ``start_urls``."""
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={
                    'User-Agent': self.user_agent,
                },
                meta={'source': source, 'type': source},
            )

    def parse(self, response, **kwargs):
        """Dispatch a response to the list or detail parser based on its meta type."""
        page_type = response.meta['type']
        if page_type == PageType.PLATE_LIST:
            yield from self.parse_board_list(response)
        elif page_type == PageType.PLATE_DETAIL:
            yield from self.parse_board_detail(response)

    def parse_board_list(self, response):
        """Yield a detail-page request for every matching article link.

        Links without an ``href`` or without text are ignored, as are links
        whose text does not contain ``_TITLE_KEYWORD``.
        """
        for link in response.xpath('//*[@id="articleList"]/ul/li/a'):
            uri = link.xpath('@href').get()
            name = link.xpath('text()').get()
            self.logger.debug('list entry: %s %s', uri, name)
            if not uri or not name or self._TITLE_KEYWORD not in name:
                continue
            yield Request(
                method='GET',
                # hrefs may be relative or protocol-relative; Request needs
                # an absolute URL with a scheme.
                url=response.urljoin(uri),
                headers={
                    # Prefer the injected user agent so it matches the one
                    # the cookies belong to.
                    'User-Agent': self.user_agent or self._DEFAULT_USER_AGENT,
                },
                cookies=self.cookie,
                meta={'source': name, 'type': PageType.PLATE_DETAIL},
            )

    def parse_board_detail(self, response):
        """Yield one price dict per data row of the article's ``marketTable``.

        The date is taken from the page title. Rows with fewer than seven
        cells, or with a non-numeric price/fluctuation, are skipped so that
        one malformed row does not discard the rest of the table.
        """
        title = response.xpath('//*[@id="content-title"]/text()').get()
        date = self._extract_date(title)
        if date is None:
            self.logger.warning(
                'no date found in title %r (%s); skipping page', title, response.url
            )
            return

        # The first two <tr> elements are skipped (presumably header rows --
        # confirm against the page markup).
        for row in response.xpath('//*[@id="marketTable"]/tr[position()>2]'):
            # A cell without a direct text node yields None; treat it as empty.
            line = [(cell.xpath('text()').get() or '').strip() for cell in row.xpath('td')]
            self.logger.debug('table row: %s', line)
            if len(line) < 7:
                continue
            name, spec, material, source, price, fluctuating, *_ = line
            try:
                price_value = int(price)
                # '-' marks "no change".
                change = 0 if fluctuating == '-' else int(fluctuating)
            except ValueError:
                self.logger.warning('skipping row with non-numeric values: %s', line)
                continue
            yield {
                'name': name,
                'spec': spec,
                'material': material,
                'source': source,
                'price': price_value,
                'fluctuating': change,
                'date': date,
            }

    @classmethod
    def _extract_date(cls, title):
        """Return the first year/month/day found in *title* as ``'Y-M-D'``.

        Digits are kept exactly as they appear in the title (no zero
        padding). Returns ``None`` when *title* is empty or has no date.
        """
        if not title:
            return None
        match = cls._TITLE_DATE_RE.search(title)
        if match is None:
            return None
        return '-'.join(match.groups())
if __name__ == '__main__':
    # Script entry point: crawl with a logged-in session, then store every
    # scraped row through the SteelPlate model.
    import json

    from spiders import run_spider, MysteelCookieTools
    from commons.models.steel_plate import SteelPlate
    from core.factory import ClientApp

    # Read the cookies.
    cookie = MysteelCookieTools.get_cookies()

    # Crawl: inject the session into the spider class, then run it.
    SteelPlateSpider.cookie = cookie
    SteelPlateSpider.user_agent = MysteelCookieTools.user_agent
    file_path = run_spider(SteelPlateSpider)

    # Store: the path returned by run_spider is read as a UTF-8 JSON document.
    # The context manager closes the file handle once it has been parsed.
    with open(file_path, 'r', encoding='utf-8') as result_file:
        data = json.load(result_file)

    with ClientApp().app_context():
        for item in data:
            print(item)
            SteelPlate(**item).upsert()