init
This commit is contained in:
157
web/spiders/__init__.py
Normal file
157
web/spiders/__init__.py
Normal file
@@ -0,0 +1,157 @@
|
||||
import json
|
||||
import logging
|
||||
import pathlib
|
||||
from urllib.parse import unquote
|
||||
|
||||
import requests
|
||||
from lxml import etree
|
||||
from scrapy.crawler import CrawlerProcess
|
||||
from scrapy.utils.project import get_project_settings
|
||||
|
||||
from utils.login import login_mysteel, login_baiinfo
|
||||
|
||||
|
||||
def run_spider(spider):
    """Run ``spider`` through a Scrapy ``CrawlerProcess`` with a JSON feed.

    The project settings are loaded and their ``FEEDS`` entry is replaced so
    that scraped items are exported to ``<spider.name>.json`` (UTF-8 JSON,
    overwriting any previous export).

    Args:
        spider: Object exposing a ``name`` attribute; it is handed to
            ``CrawlerProcess.crawl`` unchanged.

    Returns:
        pathlib.Path: Path built from the bare feed file name (it is not
        made absolute).
    """
    output_name = f'{spider.name}.json'

    # Export options for the single feed this run produces.
    feed_options = {
        'format': 'json',
        'encoding': 'utf8',
        'overwrite': True,
    }

    project_settings = get_project_settings()
    project_settings.set('FEEDS', {output_name: feed_options})

    crawler = CrawlerProcess(project_settings)
    crawler.crawl(spider)
    crawler.start()

    return pathlib.Path(output_name)
|
||||
|
||||
class CookieTools:
    """Common base for the site-specific cookie helper classes.

    Provides the desktop-Chrome ``User-Agent`` string that subclasses can
    send with their HTTP requests.
    """

    # Split across lines for readability; the pieces concatenate to a single
    # header value.
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/116.0.0.0 Safari/537.36'
    )
|
||||
|
||||
|
||||
class MysteelCookieTools(CookieTools):
    """Load, validate and refresh the cached login cookies for mysteel.com."""

    @staticmethod
    def is_valid(cookies, timeout=30):
        """Check whether ``cookies`` still open the Mysteel account page.

        Args:
            cookies: Cookie mapping passed straight to ``requests.get``
                (``None`` is accepted and simply sends no cookies).
            timeout: Seconds ``requests`` waits on the server before raising;
                without a timeout a stalled connection would block forever.

        Returns:
            bool: ``True`` when the page contains the ``退出登录`` link,
            ``False`` when the ``会员登录`` heading is found or the link is
            missing.

        Raises:
            requests.RequestException: On network failure or timeout.
        """
        result = requests.get(
            'https://e.mysteel.com/account_index.htm',
            cookies=cookies,
            timeout=timeout,
        )
        result.encoding = result.apparent_encoding
        tree = etree.HTML(result.text)

        # This heading is only expected on the login page the site redirects
        # to when the session is no longer accepted.
        flag_tag = tree.xpath('/html/body/div[1]/div[1]/p[2]/text()')
        logging.debug('Mysteel login-page marker: %s', flag_tag)
        if flag_tag and flag_tag[0] == '会员登录':
            logging.warning('Mysteel.com Cookies 无效或已过期 | 强制跳转至登陆页')
            return False

        # The logout link must be present for the session to count as valid.
        flag_tag = tree.xpath('//*[@id="top"]/div/span[2]/a/text()')
        logging.debug('Mysteel account-page marker: %s', flag_tag)
        if not (flag_tag and flag_tag[0] == '退出登录'):
            logging.warning('Mysteel.com Cookies 无效或已过期 | 无法正确进入个人中心')
            return False

        logging.warning('Mysteel.com Cookies 验证成功 | 成功进入个人中心')
        return True

    @staticmethod
    def save_as_json(raw_cookies, file_path):
        """Extract the session cookies from ``raw_cookies`` and save them.

        Args:
            raw_cookies: Iterable of cookie dicts, each with a ``name`` key
                and (normally) a ``value`` key.
            file_path: Destination of the JSON file.

        Returns:
            ``file_path`` on success, ``None`` when ``_login_token`` or
            ``_MSPASS_SESSION`` is absent from ``raw_cookies`` (a warning is
            logged and nothing is written).
        """
        # Only the two lookups can raise IndexError, so only they are guarded.
        try:
            login_token = [i for i in raw_cookies if i['name'] == '_login_token'][0].get('value')
            session = [i for i in raw_cookies if i['name'] == '_MSPASS_SESSION'][0].get('value')
        except IndexError:
            logging.warning('保存失败 | 无法正确解析原始 cookies')
            return None

        text = json.dumps({
            '_login_token': login_token,
            # NOTE(review): the token *value* is also stored as a cookie name
            # with the fixed value '1=10'; kept exactly as before - confirm
            # this is what the site expects.
            login_token: '1=10',
            '_MSPASS_SESSION': session,
        })
        # Write with the same encoding read_from_json uses.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(text)
        return file_path

    @staticmethod
    def read_from_json(file_path):
        """Return the cookie mapping stored as JSON in ``file_path``.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            json.JSONDecodeError: If the file does not contain valid JSON.
        """
        # The context manager closes the handle; the previous bare open() did not.
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    @classmethod
    def get_cookies(cls, file_path=r'E:\Project\item_spider\mysteel.cookies.json'):
        """Return usable Mysteel cookies, logging in again when needed.

        The cached cookies in ``file_path`` are validated first. If they are
        rejected, or the cache file does not exist yet, ``login_mysteel`` is
        called, its result is saved to ``file_path`` and the file is re-read.

        Args:
            file_path: Cookie cache file. The default is a machine-specific
                Windows path; pass an explicit path elsewhere.

        Returns:
            The cookie mapping read from ``file_path``.
        """
        try:
            cookie = cls.read_from_json(file_path)
        except FileNotFoundError:
            # First run: no cache yet, so fall through to a fresh login
            # instead of crashing.
            cookie = None

        if not cls.is_valid(cookie):
            raw_cookies = login_mysteel()
            cls.save_as_json(raw_cookies, file_path)
            cookie = cls.read_from_json(file_path)
        return cookie
|
||||
|
||||
|
||||
class BaiinfoCookieTools(CookieTools):
    """Load, validate and refresh the cached login cookies for baiinfo.com."""

    @staticmethod
    def is_valid(cookies, timeout=30):
        """Check whether the Baiinfo token and cookies are still accepted.

        Two probes are made: an API call authenticated with the token taken
        from the ``user`` cookie, then a page request whose ``<title>`` is
        inspected.

        Args:
            cookies: Mapping with a ``user`` entry holding a JSON string that
                contains a ``token`` field. A falsy value is treated as
                invalid without any request being made.
            timeout: Seconds ``requests`` waits on the server for each call;
                without a timeout a stalled connection would block forever.

        Returns:
            bool: ``True`` only when the API answers with code 200 and the
            page title contains ``水泥价格(华东) - 百川盈孚``.

        Raises:
            requests.RequestException: On network failure or timeout.
        """
        if not cookies:
            return False

        # The API probe authenticates via the Baiinfo-Auth header; cookies
        # are not sent on this call.
        result = requests.post(
            'http://www.baiinfo.com/api/website/price/priceInfo/getPriceList',
            json={"channelId": "18", "pricesGroupId": 526},
            headers={
                'Baiinfo-Auth': json.loads(cookies['user'])['token'],
                'User-Agent': CookieTools.user_agent,
            },
            timeout=timeout,
        )
        flag = json.loads(result.text)
        if flag['code'] != 200:
            logging.warning(f'Baiinfo.com Token 无效或已过期 | {flag["msg"]}')
            return False

        result = requests.get(
            'http://www.baiinfo.com/news/newscategory/4710/99/3',
            cookies=cookies,
            headers={'User-Agent': 'PostmanRuntime/7.26.8'},
            timeout=timeout,
        )
        tree = etree.HTML(result.text)

        # Both checks look at the same <title>, so it is evaluated once.
        flag_tag = tree.xpath('//head/title/text()')
        logging.debug('Baiinfo page title: %s', flag_tag)

        if flag_tag and '用户登录' in flag_tag[0]:
            logging.warning('Baiinfo.com Cookies 无效或已过期 | 强制跳转至登陆页')
            return False

        if not (flag_tag and '水泥价格(华东) - 百川盈孚' in flag_tag[0]):
            logging.warning('Baiinfo.com Cookies 无效或已过期 | 无法正确进入鉴权页面')
            return False

        logging.warning('Baiinfo.com Cookies 验证成功 | 成功进入个鉴权页面')
        return True

    @staticmethod
    def read_from_json(file_path):
        """Read the saved raw cookies and return the decoded ``user`` cookie.

        An empty file is treated as an empty cookie list.

        Returns:
            ``{'user': <url-decoded value>}``, or ``None`` when the file
            holds no cookie named ``user``.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        # The context manager closes the handle; the previous bare open() did not.
        with open(file_path, 'r', encoding='utf-8') as f:
            raw_cookies = json.loads(f.read() or '[]')

        user_cookie = [i for i in raw_cookies if i['name'] == 'user']
        if not user_cookie:
            return None
        user = unquote(user_cookie[0].get('value'))
        return {'user': user}

    @staticmethod
    def save_as_json(raw_cookies, file_path):
        """Write ``raw_cookies`` to ``file_path`` as JSON.

        Serialisation happens before the file is opened, so a failure no
        longer leaves a truncated cache file behind.

        Returns:
            ``file_path`` on success, ``None`` when ``raw_cookies`` cannot be
            serialised (a warning is logged and the file is left untouched).
        """
        try:
            text = json.dumps(raw_cookies)
        except (TypeError, ValueError):
            # These are what json.dumps raises for unserialisable or circular
            # data; the previous IndexError handler could never fire here.
            logging.warning('保存失败 | 无法正确解析原始 cookies')
            return None

        # Write with the same encoding read_from_json uses.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(text)
        return file_path

    @classmethod
    def get_cookies(cls, file_path=r'E:\Project\item_spider\baiinfo.cookies.json'):
        """Return usable Baiinfo cookies, logging in again when needed.

        The cached cookies in ``file_path`` are validated first. If they are
        rejected, or the cache file does not exist yet, ``login_baiinfo`` is
        called, its result is saved to ``file_path`` and the file is re-read.

        Args:
            file_path: Cookie cache file. The default is a machine-specific
                Windows path; pass an explicit path elsewhere.

        Returns:
            The ``{'user': ...}`` mapping read from ``file_path``, or ``None``
            when the saved cookies contain no ``user`` entry.
        """
        try:
            cookie = cls.read_from_json(file_path)
        except FileNotFoundError:
            # First run: no cache yet, so fall through to a fresh login
            # instead of crashing.
            cookie = None

        if not cls.is_valid(cookie):
            raw_cookies = login_baiinfo()
            cls.save_as_json(raw_cookies, file_path)
            cookie = cls.read_from_json(file_path)
        return cookie
|
Reference in New Issue
Block a user