159 lines
		
	
	
		
			5.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			159 lines
		
	
	
		
			5.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import json
 | |
| import logging
 | |
| import pathlib
 | |
| from urllib.parse import unquote
 | |
| 
 | |
| import requests
 | |
| from lxml import etree
 | |
| from scrapy.crawler import CrawlerProcess
 | |
| from scrapy.utils.project import get_project_settings
 | |
| 
 | |
| from utils.login import login_mysteel, login_baiinfo
 | |
| 
 | |
| 
 | |
def run_spider(spider):
    """Crawl with *spider* and export the scraped items to ``<spider.name>.json``.

    The project settings are loaded and their ``FEEDS`` entry is replaced by a
    single JSON feed (UTF-8, overwriting any previous export) named after the
    spider. A ``CrawlerProcess`` is then created, the spider is scheduled and
    the process is started.

    Args:
        spider: Spider to run; only its ``name`` attribute is read here, the
            object itself is handed to ``CrawlerProcess.crawl``.

    Returns:
        pathlib.Path: Path built from the bare feed file name (it is not made
        absolute).
    """
    output_name = f'{spider.name}.json'
    feed_options = {
        'format': 'json',
        'encoding': 'utf8',
        'overwrite': True,
    }

    project_settings = get_project_settings()
    project_settings.set('FEEDS', {output_name: feed_options})

    crawler_process = CrawlerProcess(project_settings)
    crawler_process.crawl(spider)
    crawler_process.start()

    return pathlib.Path(output_name)
 | |
| 
 | |
| 
 | |
class CookieTools:
    """Common base for the cookie helper classes; holds the shared User-Agent."""

    # Desktop Chrome 116 on Windows 10 (x64) User-Agent string.
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/116.0.0.0 Safari/537.36'
    )
 | |
| 
 | |
| 
 | |
class MysteelCookieTools(CookieTools):
    """Load, validate and refresh the cached login cookies for mysteel.com."""

    @staticmethod
    def is_valid(cookies):
        """Check whether *cookies* still give access to the Mysteel account page.

        The account page is fetched with the cookies and two markers are
        inspected: a login prompt (the session was bounced to the login page)
        and the logout link (the account page was really rendered).

        Args:
            cookies: Mapping of cookie name to value, or a falsy value when
                nothing is cached.

        Returns:
            bool: True only when the logout link is present on the page.

        Raises:
            requests.RequestException: On network failure or timeout.
        """
        if not cookies:
            # Nothing cached: the answer is known without a network round-trip.
            return False

        # The timeout keeps a stalled server from blocking the caller forever.
        result = requests.get(
            'https://e.mysteel.com/account_index.htm',
            cookies=cookies,
            timeout=30,
        )
        result.encoding = result.apparent_encoding
        tree = etree.HTML(result.text)

        login_prompt = tree.xpath('/html/body/div[1]/div[1]/p[2]/text()')
        logging.debug('Mysteel login prompt probe: %s', login_prompt)
        if login_prompt and login_prompt[0] == '会员登录':
            logging.warning('Mysteel.com Cookies 无效或已过期 | 强制跳转至登陆页')
            return False

        logout_link = tree.xpath('//*[@id="top"]/div/span[2]/a/text()')
        logging.debug('Mysteel logout link probe: %s', logout_link)
        if not (logout_link and logout_link[0] == '退出登录'):
            logging.warning('Mysteel.com Cookies 无效或已过期 | 无法正确进入个人中心')
            return False

        logging.warning('Mysteel.com Cookies 验证成功 | 成功进入个人中心')
        return True

    @staticmethod
    def save_as_json(raw_cookies, file_path):
        """Extract the Mysteel session cookies and store them as a JSON object.

        Args:
            raw_cookies: Iterable of dicts, each with a ``'name'`` key and
                (normally) a ``'value'`` key.
            file_path: Destination of the JSON file.

        Returns:
            *file_path* on success, ``None`` when either ``_login_token`` or
            ``_MSPASS_SESSION`` is missing from *raw_cookies* (a warning is
            logged and nothing is written).
        """
        # Only the cookie lookup can raise IndexError, so keep the try narrow.
        try:
            login_token = [i for i in raw_cookies if i['name'] == '_login_token'][0].get('value')
            session = [i for i in raw_cookies if i['name'] == '_MSPASS_SESSION'][0].get('value')
        except IndexError:
            logging.warning('保存失败 | 无法正确解析原始 cookies')
            return None

        text = json.dumps({
            '_login_token': login_token,
            # A second cookie whose *name* is the token value itself.
            login_token: '1=10',
            '_MSPASS_SESSION': session,
        })
        # Written as UTF-8 to match the encoding read_from_json expects.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(text)
        return file_path

    @staticmethod
    def read_from_json(file_path):
        """Return the cookie mapping stored in the JSON file at *file_path*.

        Raises:
            FileNotFoundError: If the file does not exist.
            json.JSONDecodeError: If the file is empty or not valid JSON.
        """
        # The context manager closes the handle instead of leaking it.
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.loads(f.read())

    @classmethod
    def get_cookies(cls, file_path=r'E:\Project\item_spider\mysteel.cookies.json'):
        """Return usable Mysteel cookies, logging in again when the cache is stale.

        Args:
            file_path: Location of the JSON cookie cache.

        Returns:
            The cookie mapping read from *file_path* (re-read after a fresh
            login when the cached cookies were missing or rejected).
        """
        try:
            cookie = cls.read_from_json(file_path)
        except (FileNotFoundError, ValueError):
            # No cache yet (first run) or an unreadable one: force a login.
            cookie = None

        if not cls.is_valid(cookie):
            raw_cookies = login_mysteel()
            cls.save_as_json(raw_cookies, file_path)
            cookie = cls.read_from_json(file_path)
        return cookie
 | |
| 
 | |
| 
 | |
class BaiinfoCookieTools(CookieTools):
    """Load, validate and refresh the cached login cookies for baiinfo.com."""

    @staticmethod
    def is_valid(cookies):
        """Check whether *cookies* still authenticate against baiinfo.com.

        Two probes are made: a price-list API call authenticated with the
        token stored inside the ``user`` cookie, then a page fetch whose
        ``<title>`` shows whether the login page or the expected page came back.

        Args:
            cookies: ``{'user': <JSON string containing a 'token' field>}``,
                or a falsy value when nothing is cached.

        Returns:
            bool: True only when both probes succeed.

        Raises:
            requests.RequestException: On network failure or timeout.
        """
        if not cookies:
            return False

        # A malformed cached cookie means "invalid", not a crash.
        try:
            token = json.loads(cookies['user'])['token']
        except (KeyError, TypeError, ValueError):
            logging.warning('Baiinfo.com Cookies 无效或已过期 | 无法解析 user cookie 中的 token')
            return False

        # This call is authenticated by the Baiinfo-Auth header only; the
        # cookies themselves are not sent. The timeout avoids hanging forever.
        result = requests.post(
            'http://www.baiinfo.com/api/website/price/priceInfo/getPriceList',
            json={"channelId": "18", "pricesGroupId": 526},
            headers={
                'Baiinfo-Auth': token,
                'User-Agent': CookieTools.user_agent,
            },
            timeout=30,
        )
        flag = json.loads(result.text)
        if flag['code'] != 200:
            logging.warning(f'Baiinfo.com Token 无效或已过期 | {flag["msg"]}')
            return False

        result = requests.get(
            'http://www.baiinfo.com/news/newscategory/4710/99/3',
            cookies=cookies,
            headers={'User-Agent': 'PostmanRuntime/7.26.8'},
            timeout=30,
        )
        tree = etree.HTML(result.text)

        # Both checks look at the same page title, so query it once.
        title = tree.xpath('//head/title/text()')
        logging.debug('Baiinfo page title probe: %s', title)
        if title and '用户登录' in title[0]:
            logging.warning('Baiinfo.com Cookies 无效或已过期 | 强制跳转至登陆页')
            return False

        if not (title and '水泥价格(华东) - 百川盈孚' in title[0]):
            logging.warning('Baiinfo.com Cookies 无效或已过期 | 无法正确进入鉴权页面')
            return False

        logging.warning('Baiinfo.com Cookies 验证成功 | 成功进入个鉴权页面')
        return True

    @staticmethod
    def read_from_json(file_path):
        """Read the cached raw cookies and return the URL-decoded ``user`` cookie.

        Args:
            file_path: Location of the JSON file holding a list of cookie
                dicts (``'name'`` / ``'value'``).

        Returns:
            ``{'user': <decoded value>}``, or ``None`` when the file is empty,
            holds ``null``, or has no cookie named ``user``.

        Raises:
            FileNotFoundError: If the file does not exist.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        # The context manager closes the handle instead of leaking it.
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # An empty file or a stored ``null`` both mean "no cookies".
        raw_cookies = json.loads(text or '[]') or []
        user_cookie = [i for i in raw_cookies if i.get('name') == 'user']
        if not user_cookie:
            return None
        return {'user': unquote(user_cookie[0].get('value'))}

    @staticmethod
    def save_as_json(raw_cookies, file_path):
        """Serialise *raw_cookies* to JSON and write them to *file_path*.

        Returns:
            *file_path* on success, ``None`` when *raw_cookies* cannot be
            serialised (a warning is logged and the file is left untouched).
        """
        # Serialise before opening so a failure cannot truncate the old cache.
        try:
            text = json.dumps(raw_cookies)
        except (TypeError, ValueError):
            logging.warning('保存失败 | 无法正确解析原始 cookies')
            return None

        # Written as UTF-8 to match the encoding read_from_json expects.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(text)
        return file_path

    @classmethod
    def get_cookies(cls, file_path=r'E:\Project\item_spider\baiinfo.cookies.json'):
        """Return usable Baiinfo cookies, logging in again when the cache is stale.

        Args:
            file_path: Location of the JSON cookie cache.

        Returns:
            The value produced by ``read_from_json`` (re-read after a fresh
            login when the cached cookies were missing or rejected).
        """
        try:
            cookie = cls.read_from_json(file_path)
        except (FileNotFoundError, ValueError):
            # No cache yet (first run) or an unreadable one: force a login.
            cookie = None

        if not cls.is_valid(cookie):
            raw_cookies = login_baiinfo()
            cls.save_as_json(raw_cookies, file_path)
            cookie = cls.read_from_json(file_path)
        return cookie
 | 
