import json
import logging
import pathlib
from urllib.parse import unquote

import requests
from lxml import etree
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from utils.login import login_mysteel, login_baiinfo


def run_spider(spider):
    """Run *spider* in a blocking CrawlerProcess and return the JSON feed path.

    The feed file is named ``<spider.name>.json`` in the current working
    directory and is overwritten on each run.
    """
    filename = f'{spider.name}.json'
    settings = get_project_settings()
    # Fix: the FEEDS key must be the actual output filename. The original used
    # a placeholder literal, so the exported items never landed in the file
    # whose path this function returns.
    settings.set('FEEDS', {
        filename: {
            'format': 'json',
            'encoding': 'utf8',
            'overwrite': True,
        },
    })
    process = CrawlerProcess(settings)
    process.crawl(spider)
    process.start()  # blocks until the crawl finishes
    return pathlib.Path(filename)  # .absolute()


class CookieTools:
    """Shared constants for the site-specific cookie helpers below."""

    # Desktop Chrome UA used when a realistic browser identity is required.
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'


class MysteelCookieTools(CookieTools):
    """Persist, validate and refresh login cookies for e.mysteel.com."""

    @staticmethod
    def is_valid(cookies):
        """Return True when *cookies* still grant access to the member centre."""
        result = requests.get('https://e.mysteel.com/account_index.htm', cookies=cookies)
        result.encoding = result.apparent_encoding
        tree = etree.HTML(result.text)
        # A "会员登录" heading means we were bounced to the login page.
        flag_tag = tree.xpath('/html/body/div[1]/div[1]/p[2]/text()')
        print(flag_tag)
        if len(flag_tag) > 0 and flag_tag[0] == '会员登录':
            logging.warning('Mysteel.com Cookies 无效或已过期 | 强制跳转至登陆页')
            return False
        # The "退出登录" (logout) link only renders for an authenticated session.
        flag_tag = tree.xpath('//*[@id="top"]/div/span[2]/a/text()')
        print(flag_tag)
        if not (len(flag_tag) > 0 and flag_tag[0] == '退出登录'):
            logging.warning('Mysteel.com Cookies 无效或已过期 | 无法正确进入个人中心')
            return False
        logging.warning('Mysteel.com Cookies 验证成功 | 成功进入个人中心')
        return True

    @staticmethod
    def save_as_json(raw_cookies, file_path):
        """Extract the two auth cookies from *raw_cookies* and store them as JSON.

        *raw_cookies* is a list of ``{'name': ..., 'value': ...}`` dicts
        (selenium-style). Returns *file_path* on success, None when the
        expected cookies are absent (logged as a warning).
        """
        try:
            # [0] raises IndexError when the cookie is missing — handled below.
            login_token = [i for i in raw_cookies if i['name'] == '_login_token'][0].get('value')
            session = [i for i in raw_cookies if i['name'] == '_MSPASS_SESSION'][0].get('value')
            text = json.dumps({
                '_login_token': login_token,
                # The site also expects a cookie keyed by the token value itself.
                login_token: '1=10',
                '_MSPASS_SESSION': session
            })
            with open(file_path, 'w') as f:
                f.write(text)
            return file_path
        except IndexError:
            logging.warning('保存失败 | 无法正确解析原始 cookies')

    @staticmethod
    def read_from_json(file_path):
        """Load previously saved cookies from *file_path*."""
        # Fix: the original leaked an open file handle.
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.loads(f.read())

    @classmethod
    def get_cookies(cls, file_path=r'E:\Project\item_spider\mysteel.cookies.json'):
        """Return valid cookies, re-logging in and re-saving them when stale."""
        cookie = cls.read_from_json(file_path)
        if not cls.is_valid(cookie):
            raw_cookies = login_mysteel()
            cls.save_as_json(raw_cookies, file_path)
            cookie = cls.read_from_json(file_path)
        return cookie


class BaiinfoCookieTools(CookieTools):
    """Persist, validate and refresh login cookies for baiinfo.com."""

    @staticmethod
    def is_valid(cookies):
        """Return True when *cookies* pass both the API token check and the page check."""
        if not cookies:
            return False
        # 1) API check: the auth token is carried in the 'Baiinfo-Auth' header,
        #    extracted from the JSON-encoded 'user' cookie value.
        result = requests.post(
            'http://www.baiinfo.com/api/website/price/priceInfo/getPriceList',
            json={"channelId": "18", "pricesGroupId": 526},
            # cookies=cookies,
            headers={
                'Baiinfo-Auth': json.loads(cookies['user'])['token'],
                'User-Agent': CookieTools.user_agent
            }
        )
        flag = json.loads(result.text)
        if flag['code'] != 200:
            logging.warning(f'Baiinfo.com Token 无效或已过期 | {flag["msg"]}')
            return False
        # 2) Page check: a protected news page must not redirect to the login page.
        result = requests.get('http://www.baiinfo.com/news/newscategory/4710/99/3',
                              cookies=cookies,
                              headers={'User-Agent': 'PostmanRuntime/7.26.8'})
        # result.encoding = result.apparent_encoding
        tree = etree.HTML(result.text)
        flag_tag = tree.xpath('//head/title/text()')
        print(flag_tag)
        if len(flag_tag) > 0 and '用户登录' in flag_tag[0]:
            logging.warning('Baiinfo.com Cookies 无效或已过期 | 强制跳转至登陆页')
            return False
        flag_tag = tree.xpath('//head/title/text()')
        print(flag_tag)
        if not (len(flag_tag) > 0 and '水泥价格(华东) - 百川盈孚' in flag_tag[0]):
            logging.warning('Baiinfo.com Cookies 无效或已过期 | 无法正确进入鉴权页面')
            return False
        logging.warning('Baiinfo.com Cookies 验证成功 | 成功进入个鉴权页面')
        return True

    @staticmethod
    def read_from_json(file_path):
        """Return ``{'user': <decoded value>}`` from the saved dump, or None if absent."""
        # Fix: close the file handle (the original leaked it). An empty file
        # falls back to '[]' so json.loads does not raise.
        with open(file_path, 'r', encoding='utf-8') as f:
            raw_cookies = json.loads(f.read() or '[]')
        user_cookie = [i for i in raw_cookies if i['name'] == 'user']
        if not user_cookie:
            return None
        # The 'user' cookie value is URL-encoded JSON carrying the auth token.
        user = unquote(user_cookie[0].get('value'))
        return {'user': user}

    @staticmethod
    def save_as_json(raw_cookies, file_path):
        """Dump the raw cookie list to *file_path* as JSON; return the path on success."""
        try:
            with open(file_path, 'w') as f:
                f.write(json.dumps(raw_cookies))
            return file_path
        except IndexError:
            # NOTE(review): json.dumps/open cannot raise IndexError — this mirrors
            # MysteelCookieTools.save_as_json but looks like dead code; confirm intent.
            logging.warning('保存失败 | 无法正确解析原始 cookies')

    @classmethod
    def get_cookies(cls, file_path=r'E:\Project\item_spider\baiinfo.cookies.json'):
        """Return valid cookies, re-logging in and re-saving them when stale."""
        cookie = cls.read_from_json(file_path)
        if not cls.is_valid(cookie):
            raw_cookies = login_baiinfo()
            cls.save_as_json(raw_cookies, file_path)
            cookie = cls.read_from_json(file_path)
        return cookie