init
157  web/spiders/__init__.py  Normal file
@@ -0,0 +1,157 @@
import json
import logging
import pathlib
from urllib.parse import unquote

import requests
from lxml import etree
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from utils.login import login_mysteel, login_baiinfo


def run_spider(spider):
    """Run a single spider in its own CrawlerProcess and return the JSON feed it wrote."""
    filename = f'{spider.name}.json'
    settings = get_project_settings()
    settings.set('FEEDS', {
        filename: {
            'format': 'json',
            'encoding': 'utf8',
            'overwrite': True,
        },
    })
    process = CrawlerProcess(settings)
    process.crawl(spider)
    process.start()
    return pathlib.Path(filename)  # .absolute()


class CookieTools:
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'


class MysteelCookieTools(CookieTools):

    @staticmethod
    def is_valid(cookies):
        """Return True if the cached Mysteel.com cookies still grant access to the member centre."""
        result = requests.get('https://e.mysteel.com/account_index.htm', cookies=cookies)
        result.encoding = result.apparent_encoding
        tree = etree.HTML(result.text)

        # Redirected to the login page: cookies are invalid or expired.
        flag_tag = tree.xpath('/html/body/div[1]/div[1]/p[2]/text()')
        print(flag_tag)
        if len(flag_tag) > 0 and flag_tag[0] == '会员登录':
            logging.warning('Mysteel.com Cookies 无效或已过期 | 强制跳转至登陆页')
            return False

        # The "log out" link only appears for an authenticated session.
        flag_tag = tree.xpath('//*[@id="top"]/div/span[2]/a/text()')
        print(flag_tag)
        if not (len(flag_tag) > 0 and flag_tag[0] == '退出登录'):
            logging.warning('Mysteel.com Cookies 无效或已过期 | 无法正确进入个人中心')
            return False

        logging.warning('Mysteel.com Cookies 验证成功 | 成功进入个人中心')
        return True

    @staticmethod
    def save_as_json(raw_cookies, file_path):
        """Reduce the raw browser cookies to the fields the site checks and cache them as JSON."""
        try:
            login_token = [i for i in raw_cookies if i['name'] == '_login_token'][0].get('value')
            session = [i for i in raw_cookies if i['name'] == '_MSPASS_SESSION'][0].get('value')
            text = json.dumps({
                '_login_token': login_token,
                login_token: '1=10',
                '_MSPASS_SESSION': session
            })
            with open(file_path, 'w') as f:
                f.write(text)
            return file_path
        except IndexError:
            logging.warning('保存失败 | 无法正确解析原始 cookies')

    @staticmethod
    def read_from_json(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.loads(f.read())

    @classmethod
    def get_cookies(cls, file_path=r'E:\Project\item_spider\mysteel.cookies.json'):
        """Read the cached cookies; if they no longer validate, log in again and refresh the cache."""
        cookie_json_file_path = file_path
        cookie = cls.read_from_json(cookie_json_file_path)
        if not cls.is_valid(cookie):
            raw_cookies = login_mysteel()
            cls.save_as_json(raw_cookies, cookie_json_file_path)
            cookie = cls.read_from_json(cookie_json_file_path)
        return cookie


class BaiinfoCookieTools(CookieTools):

    @staticmethod
    def is_valid(cookies):
        """Return True if the cached Baiinfo.com token and cookies are still accepted by the site."""
        if not cookies:
            return False
        # The price API rejects requests without a valid Baiinfo-Auth token.
        result = requests.post(
            'http://www.baiinfo.com/api/website/price/priceInfo/getPriceList',
            json={"channelId": "18", "pricesGroupId": 526},
            # cookies=cookies,
            headers={
                'Baiinfo-Auth': json.loads(cookies['user'])['token'],
                'User-Agent': CookieTools.user_agent
            }
        )
        flag = json.loads(result.text)
        if flag['code'] != 200:
            logging.warning(f'Baiinfo.com Token 无效或已过期 | {flag["msg"]}')
            return False

        # An authenticated page request should land on the price page, not the login page.
        result = requests.get('http://www.baiinfo.com/news/newscategory/4710/99/3', cookies=cookies, headers={'User-Agent': 'PostmanRuntime/7.26.8'})
        # result.encoding = result.apparent_encoding
        tree = etree.HTML(result.text)

        flag_tag = tree.xpath('//head/title/text()')
        print(flag_tag)
        if len(flag_tag) > 0 and '用户登录' in flag_tag[0]:
            logging.warning('Baiinfo.com Cookies 无效或已过期 | 强制跳转至登陆页')
            return False

        if not (len(flag_tag) > 0 and '水泥价格(华东) - 百川盈孚' in flag_tag[0]):
            logging.warning('Baiinfo.com Cookies 无效或已过期 | 无法正确进入鉴权页面')
            return False

        logging.warning('Baiinfo.com Cookies 验证成功 | 成功进入鉴权页面')
        return True

    @staticmethod
    def read_from_json(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            raw_cookies = json.loads(f.read() or '[]')
        user_cookie = [i for i in raw_cookies if i['name'] == 'user']
        if not user_cookie:
            return None
        user = user_cookie[0].get('value')
        user = unquote(user)
        return {'user': user}

    @staticmethod
    def save_as_json(raw_cookies, file_path):
        try:
            with open(file_path, 'w') as f:
                f.write(json.dumps(raw_cookies))
            return file_path
        except IndexError:
            logging.warning('保存失败 | 无法正确解析原始 cookies')

    @classmethod
    def get_cookies(cls, file_path=r'E:\Project\item_spider\baiinfo.cookies.json'):
        """Read the cached cookies; if they no longer validate, log in again and refresh the cache."""
        cookie_json_file_path = file_path
        cookie = cls.read_from_json(cookie_json_file_path)
        if not cls.is_valid(cookie):
            raw_cookies = login_baiinfo()
            cls.save_as_json(raw_cookies, cookie_json_file_path)
            cookie = cls.read_from_json(cookie_json_file_path)
        return cookie
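Each spider module below drives these helpers from its `__main__` block: fetch validated cookies, attach them to the spider class, run it through run_spider(), then load the JSON feed and upsert every row. A condensed sketch of that shared flow (the crawl_and_store wrapper is illustrative only, not part of the commit):

import json

from spiders import run_spider, BaiinfoCookieTools
from core.factory import ClientApp


def crawl_and_store(spider_cls, model_cls):
    # get_cookies() re-logs in automatically if the cached cookies fail validation.
    spider_cls.cookie = BaiinfoCookieTools.get_cookies()
    spider_cls.user_agent = BaiinfoCookieTools.user_agent
    file_path = run_spider(spider_cls)
    data = json.loads(open(file_path, 'r', encoding='utf-8').read())
    with ClientApp().app_context():
        for item in data:
            model_cls(**item).upsert()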
79  web/spiders/asphalt_domestic.py  Normal file
@@ -0,0 +1,79 @@
import json

import scrapy
from scrapy import Request


class AsphaltDomesticSpider(scrapy.Spider):
    name = "asphalt_domestic"
    start_urls = [
        ('国内炼厂重交沥青出厂价格', "http://www.baiinfo.com/api/website/price/priceInfo/getPriceList",
         {"channelId": "18", "pricesGroupId": 526}),
        ('国内市场沥青批发价格汇总', "http://www.baiinfo.com/api/website/price/priceInfo/getPriceList",
         {"channelId": "18", "pricesGroupId": 530}),
    ]
    cookie = None
    user_agent = None
    _token = None

    @property
    def token(self):
        # Lazily extract the Baiinfo-Auth token from the cached 'user' cookie.
        if self._token:
            return self._token
        self._token = json.loads(self.cookie['user'])['token']
        return self._token

    def start_requests(self):
        for source, url, data in self.start_urls:
            yield Request(
                method='POST',
                body=json.dumps(data),
                url=url,
                headers={
                    'User-Agent': self.user_agent,
                    'Content-Type': 'application/json',
                    'Baiinfo-Auth': self.token,
                    # 'Baiinfo-Auth': TOKEN,
                },
                meta={'source': source}
            )

    def parse(self, response, **kwargs):
        ret = json.loads(response.text)
        if ret.get('success') and ret.get('data'):
            for item in ret['data']:
                item['source'] = response.meta['source']
                for date in item['priceData']:
                    try:
                        price = int(item['priceData'][date])
                    except ValueError:
                        price = 0
                    yield {
                        'name': item['targetName'],
                        'price': price,
                        'date': date,
                        # 'fluctuating': item['changePriceData'][date],
                        'from_': response.meta['source'],
                    }


if __name__ == '__main__':
    from spiders import run_spider, BaiinfoCookieTools
    from commons.models.asphalt_domestic import AsphaltDomestic
    from core.factory import ClientApp

    # Read cached cookies (re-logging in if they have expired)
    cookie = BaiinfoCookieTools.get_cookies()
    # Crawl
    AsphaltDomesticSpider.cookie = cookie
    AsphaltDomesticSpider.user_agent = BaiinfoCookieTools.user_agent
    file_path = run_spider(AsphaltDomesticSpider)
    # Insert into the database
    data = json.loads(open(file_path, 'r', encoding='utf-8').read())
    with ClientApp().app_context():
        for item in data:
            print(item)
            AsphaltDomestic(**item).upsert()
85  web/spiders/asphalt_imported.py  Normal file
@@ -0,0 +1,85 @@
import scrapy
from scrapy import Request


class AsphaltImportedSpider(scrapy.Spider):
    name = "asphalt_imported"
    start_urls = [
        ('沥青', "http://www.baiinfo.com/news/newscategory/17847/18/1"),
        ('沥青', "http://www.baiinfo.com/news/newscategory/17847/18/2"),
        ('沥青', "http://www.baiinfo.com/news/newscategory/17847/18/3"),
        ('沥青', "http://www.baiinfo.com/news/newscategory/17847/18/4"),
        ('沥青', "http://www.baiinfo.com/news/newscategory/17847/18/5"),
    ]
    cookie = None
    user_agent = None

    def start_requests(self):
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
                },
                cookies=self.cookie,
                meta={'source': source, 'type': 'home'}
            )

    def parse(self, response, **kwargs):
        if response.meta['type'] == 'home':
            yield from self.parse_home(response)
        elif response.meta['type'] == 'list_page':
            yield from self.parse_list_page(response)

    def parse_home(self, response):
        # Follow only the articles listing CIF prices for bulk imported asphalt.
        for item in response.xpath('//*[@id="__nuxt"]/div/div[5]/div/div[2]/div[1]/div[1]/div[2]/ul/li/a'):
            name = item.xpath('text()').get()
            if '散装进口沥青到岸价' in name:
                print(name, 'http://www.baiinfo.com{}'.format(item.xpath('@href').get()))
                yield Request(
                    method='GET',
                    url='http://www.baiinfo.com{}'.format(item.xpath('@href').get()),
                    headers={
                        'User-Agent': self.user_agent,
                    },
                    cookies=self.cookie,
                    meta={'source': name, 'type': 'list_page'}
                )

    def parse_list_page(self, response):
        # Article dates look like "2023年9月1日"; normalise them to "2023-9-1".
        date = response.xpath('//*[@id="__nuxt"]/div/div[5]/div/div[2]/div[1]/div[2]/p[1]/span[4]/text()').get()
        date = date.strip('日').replace('年', '-').replace('月', '-')

        for item in response.xpath('//*[@id="__nuxt"]/div/div[5]/div/div[2]/div[1]/div[3]/div/div/table/tbody/tr'):
            line = [cell.xpath('text()').get() for cell in item.xpath('td/span/span/span/span/span')][:7]
            print(line)
            if line[-1] == '备注' or '品质' in line[0]:
                # Skip table header rows.
                continue
            name, *_, price, fluctuating = line
            yield {
                'name': name,
                'date': date,
                'price': int(price.split('-')[-1]),
            }


if __name__ == '__main__':
    import json

    from spiders import run_spider, BaiinfoCookieTools
    from commons.models.asphalt_imported import AsphaltImported
    from core.factory import ClientApp

    # Read cached cookies (re-logging in if they have expired)
    cookie = BaiinfoCookieTools.get_cookies()
    # Crawl
    AsphaltImportedSpider.cookie = cookie
    AsphaltImportedSpider.user_agent = BaiinfoCookieTools.user_agent
    file_path = run_spider(AsphaltImportedSpider)
    # Insert into the database
    data = json.loads(open(file_path, 'r', encoding='utf-8').read())
    with ClientApp().app_context():
        for item in data:
            print(item)
            AsphaltImported(**item).upsert()
96  web/spiders/cement.py  Normal file
@@ -0,0 +1,96 @@
import scrapy
from scrapy import Request


class CementSpider(scrapy.Spider):
    name = "cement"
    start_urls = [
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/1"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/2"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/3"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/4"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/5"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/6"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/7"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/8"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/9"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/10"),
    ]
    cookie = None
    user_agent = None

    def start_requests(self):
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={
                    'User-Agent': self.user_agent,
                },
                cookies=self.cookie,
                meta={'source': source, 'type': 'home'}
            )

    def parse(self, response, **kwargs):
        if response.meta['type'] == 'home':
            yield from self.parse_home(response)
        elif response.meta['type'] == 'list_page':
            yield from self.parse_list_page(response)

    def parse_home(self, response):
        # Follow only the Fujian cement reference price articles.
        for item in response.xpath('//*[@id="__nuxt"]/div/div[5]/div/div[2]/div[1]/div[1]/div[2]/ul/li/a'):
            title = item.xpath('text()').get()
            if '福建水泥市场参考价格' in title:
                print(title, 'http://www.baiinfo.com{}'.format(item.xpath('@href').get()))
                yield Request(
                    method='GET',
                    url='http://www.baiinfo.com{}'.format(item.xpath('@href').get()),
                    headers={
                        'User-Agent': self.user_agent,
                    },
                    cookies=self.cookie,
                    meta={'source': title, 'type': 'list_page'}
                )

    def parse_list_page(self, response):
        # Article dates look like "2023年9月1日"; normalise them to "2023-9-1".
        date = response.xpath('//*[@id="__nuxt"]/div/div[5]/div/div[2]/div[1]/div[2]/p[1]/span[4]/text()').get()
        date = date.strip('日').replace('年', '-').replace('月', '-')

        for item in response.xpath('//tr[position()>2]'):
            # The cell text is split over two <span> nesting depths: the price sits one
            # level shallower than the remaining columns.
            block_1 = [cell.get() for cell in item.xpath('td/span/span/span/span/span/text()') if cell.get()]
            price, *_ = block_1

            block_2 = [cell.get() for cell in item.xpath('td/span/span/span/span/span/span/text()') if cell.get()]
            spec, name, pack, source, _, fluctuating = block_2

            yield {
                'name': name,
                'price': price,
                'spec': spec,
                'pack': pack,
                'date': date,
                'source': source,
                'fluctuating': int(fluctuating)
            }


if __name__ == '__main__':
    import json

    from spiders import run_spider, BaiinfoCookieTools
    from commons.models.cement import Cement
    from core.factory import ClientApp

    # Read cached cookies (re-logging in if they have expired)
    cookie = BaiinfoCookieTools.get_cookies()
    # Crawl
    CementSpider.cookie = cookie
    CementSpider.user_agent = BaiinfoCookieTools.user_agent
    file_path = run_spider(CementSpider)
    # Insert into the database
    data = json.loads(open(file_path, 'r', encoding='utf-8').read())
    with ClientApp().app_context():
        for item in data:
            print(item)
            Cement(**item).upsert()
91  web/spiders/data_fujian.py  Normal file
@@ -0,0 +1,91 @@
import datetime
from dateutil.relativedelta import relativedelta

import scrapy
from scrapy import Request


CITY_ID = [
    ('93', '福州市'),
    ('94', '厦门市'),
    ('95', '宁德市'),
    ('96', '莆田市'),
    ('97', '泉州市'),
    ('98', '漳州市'),
    ('99', '龙岩市'),
    ('100', '三明市'),
    ('101', '南平市'),
    ('102', '平潭综合实验区'),
]

CLASS_CODE = [
    ('01', '01黑色及有色金属'),
    ('04', '04水泥、砖瓦灰砂'),
    ('05', '05木、竹材料及其制品'),
    ('13', '13涂料及防腐、防水材料'),
    ('14', '14油品、化工原料'),
]

URL = 'http://49.4.85.126/Information/Index?qClassCode={class_code}&qMatType=0&WayID=14&WayID2=4&CityID=7&CityID2={city_id}&Year={year}&Month={month}&Week=0&Day=0&qKeyWord='
MONTHS = 2  # number of recent months to crawl


class DataFujianSpider(scrapy.Spider):
    name = "data_fujian"

    def start_requests(self):
        # One request per (city, material category, month) combination.
        for city_id, city_name in CITY_ID:
            for class_code, source in CLASS_CODE:
                for month in range(1, 1 + MONTHS):
                    date = datetime.date.today() - relativedelta(months=month)
                    yield Request(
                        method='GET',
                        url=URL.format(year=date.year, month=date.month, class_code=class_code, city_id=city_id),
                        headers={
                            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
                        },
                        meta={'source': source, 'type': 'home', 'city': city_name, 'month': date.month, 'year': date.year}
                    )

    def parse(self, response, **kwargs):
        for item in response.xpath('//*[@id="searcList"]/div/div[2]/div[3]/div[3]/table/tr'):
            block_1 = [i.xpath('text()').get().strip() for i in item.xpath('td')]
            print(block_1)
            if not block_1:
                continue
            number, _, type_, unit, price, price_with_tax, *_ = block_1

            block_2 = [i.xpath('text()').get().strip() for i in item.xpath('td/span')]
            print(block_2)
            name, *_ = block_2

            yield {
                'number': number,
                'name': name,
                'spec': type_,
                'unit': unit,
                'price_without_tax': price,
                'price': price_with_tax,
                'category': response.meta['source'],
                'year': response.meta['year'],
                'month': response.meta['month'],
                'city': response.meta['city'],
                'date': datetime.date.today().strftime('%Y-%m-%d')
            }


if __name__ == '__main__':
    import json

    from spiders import run_spider
    from commons.models.data_fujian import DataFujian
    from core.factory import ClientApp

    # Crawl
    file_path = run_spider(DataFujianSpider)
    # Insert into the database
    data = json.loads(open(file_path, 'r', encoding='utf-8').read())
    with ClientApp().app_context():
        for item in data:
            print(item)
            DataFujian(**item).upsert()
78  web/spiders/data_guangdong.py  Normal file
@@ -0,0 +1,78 @@
import scrapy
from scrapy import Request


class DataGuangdongSpider(scrapy.Spider):
    name = "data_guangdong"
    start_urls = [
        ('材料信息价', "http://zjz.gdcd.gov.cn/zjzgdcd/zjxx_clxxj/list.shtml"),
        ('材料信息价', "http://zjz.gdcd.gov.cn/zjzgdcd/zjxx_clxxj/list_2.shtml"),
        ('材料信息价', "http://zjz.gdcd.gov.cn/zjzgdcd/zjxx_clxxj/list_3.shtml"),
        ('材料信息价', "http://zjz.gdcd.gov.cn/zjzgdcd/zjxx_clxxj/list_4.shtml"),
        ('材料信息价', "http://zjz.gdcd.gov.cn/zjzgdcd/zjxx_clxxj/list_5.shtml"),
    ]

    def start_requests(self):
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
                },
                meta={'source': source, 'type': 'home'}
            )

    def parse(self, response, **kwargs):
        if response.meta['type'] == 'home':
            yield from self.parse_home(response)
        elif response.meta['type'] == 'list':
            yield from self.parse_list(response)

    def parse_home(self, response):
        # Follow only the announcements for Guangdong's purchased-material price tables.
        for item in response.xpath('/html/body/div/div[3]/div[2]/div[2]/div/ul/li/a'):
            uri = item.xpath('@href').get()
            name = item.xpath('text()').get()
            if '广东省交通建设工程主要外购材料信息价表' not in name:
                continue
            yield Request(
                method='GET',
                url=f'http://zjz.gdcd.gov.cn{uri}',
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
                },
                meta={'source': name, 'type': 'list'}
            )

    def parse_list(self, response):
        date = response.xpath('/html/body/div[1]/div[4]/div/div[1]/div[2]/span[2]/text()').get().strip('发布时间:')
        source = response.xpath('/html/body/div[1]/div[4]/div/div[1]/div[2]/span[1]/b/text()').get()
        for item in response.xpath('//*[@id="zoomcon"]/p/a'):
            uri = item.xpath('@href').get()
            name = item.xpath('text()').get()
            url_prefix = '/'.join(response.url.split('/')[:-1])
            print(uri, name)
            yield {
                'url': f'{url_prefix}/{uri}',
                'name': name,
                # 'source': response.meta['source'],
                'source': source,
                'date': date
            }


if __name__ == '__main__':
    import json

    from spiders import run_spider
    from commons.models.data_guangdong import DataGuangdong
    from core.factory import ClientApp

    # Crawl
    file_path = run_spider(DataGuangdongSpider)
    # Insert into the database
    data = json.loads(open(file_path, 'r', encoding='utf-8').read())
    with ClientApp().app_context():
        for item in data:
            print(item)
            DataGuangdong(**item).upsert()
84  web/spiders/date_zhejiang.py  Normal file
@@ -0,0 +1,84 @@
import scrapy
from lxml import etree
from scrapy import Request


class DataZhejiangSpider(scrapy.Spider):
    name = "data_zhejiang"
    start_urls = [
        ('材料价格', "http://jtyst.zj.gov.cn/col/col1228999576/index.html"),
        ('材料价格', "http://jtyst.zj.gov.cn/col/col1228999576/index.html?uid=5509220&pageNum=2"),
        ('材料价格', "http://jtyst.zj.gov.cn/col/col1228999576/index.html?uid=5509220&pageNum=3"),
        ('材料价格', "http://jtyst.zj.gov.cn/col/col1228999576/index.html?uid=5509220&pageNum=4"),
        ('材料价格', "http://jtyst.zj.gov.cn/col/col1228999576/index.html?uid=5509220&pageNum=5"),
    ]

    def start_requests(self):
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
                },
                meta={'source': source, 'type': 'home'}
            )

    def parse(self, response, **kwargs):
        if response.meta['type'] == 'home':
            yield from self.parse_home(response)
        elif response.meta['type'] == 'list':
            yield from self.parse_list(response)

    def parse_home(self, response):
        # The article list is embedded as XML inside a <script> CDATA block; strip the
        # wrapper tags and parse what is left as HTML.
        rsp = response.xpath('//*[@id="5509220"]/script/text()').get()
        for t in ('<![CDATA[', ']]>', '</record>', '<record>', '</recordset>', '<recordset>', '</datastore>', '<datastore>', '</nextgroup>', '<nextgroup>'):
            rsp = rsp.replace(t, '')
        tree = etree.HTML(rsp)
        for item in tree.xpath('//li/a'):
            print(item)
            uri = item.xpath('@href')[0]
            name = item.xpath('text()')[0]
            print(uri, name)
            if '《质监与造价》价格信息专辑' not in name:
                continue
            yield Request(
                method='GET',
                url=f'http://jtyst.zj.gov.cn{uri}',
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
                },
                meta={'source': name, 'type': 'list'}
            )

    def parse_list(self, response):
        date = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/p/span[1]/text()').get().split('日')[0].replace('年', '-').replace('月', '-')
        source = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/p/span[2]/text()[2]').get()
        for item in response.xpath('//*[@id="zoom"]/p/a'):
            uri = item.xpath('@href').get()
            name = item.xpath('text()').get()
            print(uri, name)
            yield {
                'url': f'https://jtyst.zj.gov.cn{uri}',
                'name': name,
                # 'source': response.meta['source']
                'source': source,
                'date': date,
            }


if __name__ == '__main__':
    import json

    from spiders import run_spider
    from commons.models.data_zhejiang import DataZhejiang
    from core.factory import ClientApp

    # Crawl
    file_path = run_spider(DataZhejiangSpider)
    # Insert into the database
    data = json.loads(open(file_path, 'r', encoding='utf-8').read())
    with ClientApp().app_context():
        for item in data:
            print(item)
            DataZhejiang(**item).upsert()
83  web/spiders/oil.py  Normal file
@@ -0,0 +1,83 @@
import json

import scrapy
from scrapy import Request


class OilSpider(scrapy.Spider):
    name = "oil"
    start_urls = [
        ('成品油价格调整', "https://fgw.fujian.gov.cn/was5/web/search?channelid=217025&templet=advsch.jsp&sortfield=-docreltime&classsql=%25%E6%88%90%E5%93%81%E6%B2%B9%E4%BB%B7%E6%A0%BC%E8%B0%83%E6%95%B4%25*siteid%3D31*siteid%3D31&prepage=100&page=1"),
    ]

    def start_requests(self):
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
                },
                meta={'source': source, 'type': 'list'}
            )

    def parse(self, response, **kwargs):
        if response.meta['type'] == 'list':
            yield from self.parse_list(response)
        elif response.meta['type'] == 'detail':
            yield from self.parse_detail(response)

    def parse_list(self, response):
        # The search endpoint returns JSON; follow every announcement that has a title.
        ret = json.loads(response.text.replace('\n', ''))
        if ret.get('count') and ret.get('docs'):
            for item in ret['docs']:
                if not item.get('title2'):
                    continue
                print(f"{item['title2']} {item['pubtime']}")
                yield Request(
                    method='GET',
                    url=item['url'],
                    meta={'source': f"{item['title2']} {item['pubtime']}", 'type': 'detail', 'time': item['pubtime']}
                )

    def parse_detail(self, response):
        # The price table is laid out inconsistently: the product name is sometimes wrapped
        # in a <span>, sometimes a bare <td>; handle both layouts and skip header rows.
        for item in response.xpath('//table[1]/tbody/tr'):
            if len([i.get() for i in item.xpath('td/span/text()')]) > 0:
                first_word = item.xpath('td/span/text()').get()
                if first_word.strip() == '油品' or first_word.strip() == '元/吨':
                    continue
                name = first_word
                price, *_ = [i.get() for i in item.xpath('td/text()')]
                yield {
                    'name': name,
                    'price': int(price),
                    'date': response.meta['time'].split(' ')[0],
                }
            elif len([i.get() for i in item.xpath('td/text()')]) > 0:
                first_word = item.xpath('td/text()').get()
                if first_word.strip() == '油品' or first_word.strip() == '元/吨' or first_word.startswith('\xa0') or first_word.startswith('\n'):
                    continue
                name, price, *_ = [i.get() for i in item.xpath('td/text()')]
                yield {
                    'name': name,
                    'price': price,
                    'date': response.meta['time'].split(' ')[0],
                }


if __name__ == '__main__':
    from spiders import run_spider
    from commons.models.oil import Oil
    from core.factory import ClientApp

    # Crawl
    file_path = run_spider(OilSpider)
    # Insert into the database
    data = json.loads(open(file_path, 'r', encoding='utf-8').read())
    with ClientApp().app_context():
        for item in data:
            print(item)
            Oil(**item).upsert()
89  web/spiders/steel_plate.py  Normal file
@@ -0,0 +1,89 @@
import scrapy
from scrapy import Request

from commons.constants.mysteel import PageType


class SteelPlateSpider(scrapy.Spider):
    name = "steel_plate"
    start_urls = [
        (PageType.PLATE_LIST, "https://list1.mysteel.com/market/p-219-----010102-0-01010502-------1.html"),
    ]
    cookie = None
    user_agent = None

    def start_requests(self):
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={
                    'User-Agent': self.user_agent,
                },
                meta={'source': source, 'type': source}
            )

    def parse(self, response, **kwargs):
        if response.meta['type'] == PageType.PLATE_LIST:
            yield from self.parse_board_list(response)
        if response.meta['type'] == PageType.PLATE_DETAIL:
            yield from self.parse_board_detail(response)

    def parse_board_list(self, response):
        # Follow only the Fuzhou medium/heavy plate price articles.
        for item in response.xpath('//*[@id="articleList"]/ul/li/a'):
            uri = item.xpath('@href').get()
            name = item.xpath('text()').get()
            print(uri, name)
            if '福州市场中厚板价格行情' not in name:
                continue
            yield Request(
                method='GET',
                url=uri,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
                },
                cookies=self.cookie,
                meta={'source': name, 'type': PageType.PLATE_DETAIL}
            )

    def parse_board_detail(self, response):
        # Parse the date out of the article title, e.g. "2023年9月1日..." -> "2023-9-1".
        title = response.xpath('//*[@id="content-title"]/text()').get()
        date = title.split('日')[0].replace('年', '-').replace('月', '-')

        for item in response.xpath('//*[@id="marketTable"]/tr[position()>2]'):
            line = [cell.xpath('text()').get().strip() for cell in item.xpath('td')]
            print(line)
            if len(line) < 7:
                continue
            name, spec, material, source, price, fluctuating, *_ = line
            yield {
                'name': name,
                'spec': spec,
                'material': material,
                'source': source,
                'price': int(price),
                'fluctuating': 0 if fluctuating == '-' else int(fluctuating),
                'date': date
            }


if __name__ == '__main__':
    import json

    from spiders import run_spider, MysteelCookieTools
    from commons.models.steel_plate import SteelPlate
    from core.factory import ClientApp

    # Read cached cookies (re-logging in if they have expired)
    cookie = MysteelCookieTools.get_cookies()
    # Crawl
    SteelPlateSpider.cookie = cookie
    SteelPlateSpider.user_agent = MysteelCookieTools.user_agent
    file_path = run_spider(SteelPlateSpider)
    # Insert into the database
    data = json.loads(open(file_path, 'r', encoding='utf-8').read())
    with ClientApp().app_context():
        for item in data:
            print(item)
            SteelPlate(**item).upsert()
90  web/spiders/steel_rebar.py  Normal file
@@ -0,0 +1,90 @@
import scrapy
from scrapy import Request

from price.constants.mysteel import PageType


class SteelRebarSpider(scrapy.Spider):
    name = "steel_rebar"
    start_urls = [
        (PageType.REBAR_LIST, "https://jiancai.mysteel.com/market/pa228aa010101a0a01010502aaaa1.html"),
    ]
    cookie = None
    user_agent = None

    def start_requests(self):
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={
                    'User-Agent': self.user_agent,
                },
                meta={'source': source, 'type': source}
            )

    def parse(self, response, **kwargs):
        if response.meta['type'] == PageType.REBAR_LIST:
            yield from self.parse_steel_list(response)
        elif response.meta['type'] == PageType.REBAR_DETAIL:
            yield from self.parse_steel_detail(response)

    def parse_steel_list(self, response):
        # Follow only the Fuzhou construction steel price articles.
        for item in response.xpath('//*[@id="articleList"]/ul/li/a'):
            uri = item.xpath('@href').get()
            name = item.xpath('text()').get()
            print(uri, name)
            if ')福州市场建筑钢材价格行情' not in name:
                continue
            yield Request(
                method='GET',
                url=uri,
                headers={
                    'User-Agent': self.user_agent,
                },
                cookies=self.cookie,
                meta={'source': name, 'type': PageType.REBAR_DETAIL}
            )

    @staticmethod
    def parse_steel_detail(response):
        # Parse the date out of the article title, e.g. "2023年9月1日..." -> "2023-9-1".
        title = response.xpath('//*[@id="content-title"]/text()').get()
        date = title.split('日')[0].replace('年', '-').replace('月', '-')
        # Parse the price table, skipping the two header rows.
        for item in response.xpath('//*[@id="marketTable"]/tr[position()>2]'):
            line = [cell.xpath('text()').get().strip() for cell in item.xpath('td')]
            print(line)
            if len(line) < 8:
                continue
            name, spec, material, source, price, fluctuating, *_ = line
            yield {
                'name': name,
                'spec': spec,
                'material': material,
                'source': source,
                'price': int(price),
                'fluctuating': 0 if fluctuating == '-' else int(fluctuating),
                'date': date
            }


if __name__ == '__main__':
    import json

    from spiders import run_spider, MysteelCookieTools
    from commons.models.steel_rebar import SteelRebar
    from core.factory import ClientApp

    # Read cached cookies (re-logging in if they have expired)
    cookie = MysteelCookieTools.get_cookies()
    # Crawl
    SteelRebarSpider.cookie = cookie
    SteelRebarSpider.user_agent = MysteelCookieTools.user_agent
    file_path = run_spider(SteelRebarSpider)
    # Insert into the database
    data = json.loads(open(file_path, 'r', encoding='utf-8').read())
    with ClientApp().app_context():
        for item in data:
            print(item)
            SteelRebar(**item).upsert()
88  web/spiders/steel_section.py  Normal file
@@ -0,0 +1,88 @@
import scrapy
from scrapy import Request

from commons.constants.mysteel import PageType


class SteelSectionSpider(scrapy.Spider):
    name = "steel_section"
    start_urls = [
        (PageType.SECTION_LIST, "https://list1.mysteel.com/market/p-227-----010107-0-01010502-------1.html"),
    ]
    cookie = None
    user_agent = None

    def start_requests(self):
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={
                    'User-Agent': self.user_agent,
                },
                meta={'source': source, 'type': source}
            )

    def parse(self, response, **kwargs):
        if response.meta['type'] == PageType.SECTION_LIST:
            yield from self.parse_section_list(response)
        if response.meta['type'] == PageType.SECTION_DETAIL:
            yield from self.parse_section_detail(response)

    def parse_section_list(self, response):
        # Follow only the Fuzhou angle/channel/I-beam price articles.
        for item in response.xpath('//*[@id="articleList"]/ul/li/a'):
            uri = item.xpath('@href').get()
            name = item.xpath('text()').get()
            print(uri, name)
            if '福州市场工角槽钢价格行情' not in name:
                continue
            yield Request(
                method='GET',
                url=uri,
                headers={
                    'User-Agent': self.user_agent,
                },
                cookies=self.cookie,
                meta={'source': name, 'type': PageType.SECTION_DETAIL}
            )

    def parse_section_detail(self, response):
        # Parse the date out of the article title, e.g. "2023年9月1日..." -> "2023-9-1".
        title = response.xpath('//*[@id="content-title"]/text()').get()
        date = title.split('日')[0].replace('年', '-').replace('月', '-')

        for item in response.xpath('//*[@id="marketTable"]/tr[position()>2]'):
            line = [cell.xpath('text()').get().strip() for cell in item.xpath('td')]
            print(line)
            if len(line) < 7:
                continue
            name, spec, material, source, price, fluctuating, *_ = line
            yield {
                'name': name,
                'spec': spec,
                'material': material,
                'source': source,
                'price': int(price),
                'fluctuating': 0 if fluctuating == '-' else int(fluctuating),
                'date': date
            }


if __name__ == '__main__':
    import json

    from spiders import run_spider, MysteelCookieTools
    from commons.models.steel_section import SteelSection
    from core.factory import ClientApp

    # Read cached cookies (re-logging in if they have expired)
    cookie = MysteelCookieTools.get_cookies()
    # Crawl
    SteelSectionSpider.cookie = cookie
    SteelSectionSpider.user_agent = MysteelCookieTools.user_agent
    file_path = run_spider(SteelSectionSpider)
    # Insert into the database
    data = json.loads(open(file_path, 'r', encoding='utf-8').read())
    with ClientApp().app_context():
        for item in data:
            print(item)
            SteelSection(**item).upsert()