import scrapy
from scrapy import Request


class AsphaltImportedSpider(scrapy.Spider):
    name = "asphalt_imported"
    # (source, url) pairs; '沥青' ("asphalt") tags each request's source.
    # Scrapy's start_urls normally holds plain URL strings; the tuple form
    # only works because start_requests() below unpacks it itself.
    start_urls = [
        ('沥青', "http://www.baiinfo.com/news/newscategory/17847/18/1"),
        ('沥青', "http://www.baiinfo.com/news/newscategory/17847/18/2"),
        ('沥青', "http://www.baiinfo.com/news/newscategory/17847/18/3"),
        ('沥青', "http://www.baiinfo.com/news/newscategory/17847/18/4"),
        ('沥青', "http://www.baiinfo.com/news/newscategory/17847/18/5"),
    ]
    # Both are injected at runtime by the __main__ block below.
    cookie = None
    user_agent = None

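    # Crawl flow: each start URL is a category listing page ("home"); links
    # whose titles match the target report are followed as "list_page"
    # detail pages, where the price table is parsed.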
    def start_requests(self):
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={
                    # Prefer the User-Agent the cookies were captured under;
                    # fall back to a generic desktop Chrome UA otherwise.
                    'User-Agent': self.user_agent or (
                        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                        'AppleWebKit/537.36 (KHTML, like Gecko) '
                        'Chrome/116.0.0.0 Safari/537.36'
                    ),
                },
                cookies=self.cookie,
                meta={'source': source, 'type': 'home'},
            )

    def parse(self, response, **kwargs):
        # Dispatch on the crawl stage recorded in the request meta.
        if response.meta['type'] == 'home':
            yield from self.parse_home(response)
        elif response.meta['type'] == 'list_page':
            yield from self.parse_list_page(response)

    def parse_home(self, response):
        for item in response.xpath('//*[@id="__nuxt"]/div/div[5]/div/div[2]/div[1]/div[1]/div[2]/ul/li/a'):
            name = item.xpath('text()').get()
            # Follow only articles about bulk imported asphalt CIF prices
            # ('散装进口沥青到岸价'); guard against links with no text.
            if name and '散装进口沥青到岸价' in name:
                url = 'http://www.baiinfo.com{}'.format(item.xpath('@href').get())
                self.logger.info('%s %s', name, url)
                yield Request(
                    method='GET',
                    url=url,
                    headers={
                        'User-Agent': self.user_agent,
                    },
                    cookies=self.cookie,
                    meta={'source': name, 'type': 'list_page'},
                )

    def parse_list_page(self, response):
        date = response.xpath('//*[@id="__nuxt"]/div/div[5]/div/div[2]/div[1]/div[2]/p[1]/span[4]/text()').get()
        date = date.strip('日').replace('年', '-').replace('月', '-')
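        # The normalization above turns e.g. '2024年3月15日' into '2024-3-15'.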

        for item in response.xpath('//*[@id="__nuxt"]/div/div[5]/div/div[2]/div[1]/div[3]/div/div/table/tbody/tr'):
            line = [cell.xpath('text()').get() for cell in item.xpath('td/span/span/span/span/span')][:7]
            self.logger.debug('%s', line)
            # Skip non-data rows: short/empty rows, the header row (its last
            # cell is '备注', "remarks") and quality-spec rows ('品质').
            if len(line) < 3 or not line[0] or line[-1] == '备注' or '品质' in line[0]:
                continue
            # The trailing change column ('fluctuating') is parsed but not stored.
            name, *_, price, fluctuating = line
            yield {
                'name': name,
                'date': date,
                # Prices typically arrive as a range like '3550-3600';
                # split('-')[-1] keeps the last (upper) value.
                'price': int(price.split('-')[-1]),
            }
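
    # Illustrative only (hypothetical values): a data row scraped as
    #   ['韩国SK', '70#', '华东', 'CFR', '美元/吨', '3550-3600', '50']
    # would be emitted as
    #   {'name': '韩国SK', 'date': '2024-3-15', 'price': 3600}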


if __name__ == '__main__':
    import json

    from spiders import run_spider, BaiinfoCookieTools
    from commons.models.asphalt_imported import AsphaltImported
    from core.factory import ClientApp

    # Load the saved cookies for baiinfo.com.
    cookie = BaiinfoCookieTools.get_cookies()
    # Crawl: inject the cookie and the User-Agent it was captured under.
    AsphaltImportedSpider.cookie = cookie
    AsphaltImportedSpider.user_agent = BaiinfoCookieTools.user_agent
    file_path = run_spider(AsphaltImportedSpider)
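    # run_spider is assumed (from its use here) to execute the crawl and
    # return the path of the JSON feed file it wrote.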
    # Persist to the database.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    with ClientApp().app_context():
        for item in data:
            print(item)
            AsphaltImported(**item).upsert()
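
# Standalone alternative (sketch, using only stock Scrapy APIs; the cookie
# and user_agent placeholders must be filled in by hand):
#
#   from scrapy.crawler import CrawlerProcess
#
#   AsphaltImportedSpider.cookie = {...}       # placeholder
#   AsphaltImportedSpider.user_agent = '...'   # placeholder
#   process = CrawlerProcess(settings={
#       'FEEDS': {'asphalt_imported.json': {'format': 'json'}},
#   })
#   process.crawl(AsphaltImportedSpider)
#   process.start()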