89 lines
		
	
	
		
			3.0 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			89 lines
		
	
	
		
			3.0 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import scrapy
 | |
| from scrapy import Request
 | |
| 
 | |
| from commons.constants.mysteel import PageType
 | |
| 
 | |
| 
 | |
class SteelSectionSpider(scrapy.Spider):
    """Scrape section-steel price tables from mysteel.com.

    The crawl starts from the list pages in ``start_urls``, follows every
    article link whose text contains ``list_title_keyword`` and yields one
    dict per price-table row found on the article (detail) page.

    ``cookie`` and ``user_agent`` are plain class attributes that the caller
    is expected to assign before the crawl starts; both default to ``None``.
    """

    name = "steel_section"
    # (page type, URL) pairs. These are unpacked by start_requests(), which
    # overrides Scrapy's default handling of ``start_urls``.
    start_urls = [
        (PageType.SECTION_LIST, "https://list1.mysteel.com/market/p-227-----010107-0-01010502-------1.html"),
    ]
    # Cookies sent with detail-page requests (not with the initial list request).
    cookie = None
    # Value of the User-Agent header sent with every request.
    user_agent = None
    # Only list entries whose link text contains this string are followed
    # (roughly: "Fuzhou market I-beam / angle / channel steel price quotes").
    list_title_keyword = '福州市场工角槽钢价格行情'

    def _build_headers(self):
        """Return the request headers shared by every request of this spider."""
        return {'User-Agent': self.user_agent}

    def start_requests(self):
        """Yield one GET request per ``(page type, url)`` pair in ``start_urls``."""
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers=self._build_headers(),
                # 'type' drives the dispatch in parse(); 'source' carries the
                # same value for the initial requests.
                meta={'source': source, 'type': source},
            )

    def parse(self, response, **kwargs):
        """Dispatch the response to the parser matching ``response.meta['type']``.

        Responses whose type is neither ``SECTION_LIST`` nor
        ``SECTION_DETAIL`` produce nothing.
        """
        page_type = response.meta['type']
        if page_type == PageType.SECTION_LIST:
            yield from self.parse_section_list(response)
        elif page_type == PageType.SECTION_DETAIL:
            yield from self.parse_section_detail(response)

    def parse_section_list(self, response):
        """Yield a detail-page request for every matching link on a list page.

        Links without an ``href`` or without text, and links whose text does
        not contain ``list_title_keyword``, are skipped.
        """
        for link in response.xpath('//*[@id="articleList"]/ul/li/a'):
            uri = link.xpath('@href').get()
            name = link.xpath('text()').get()
            self.logger.debug('list entry: %s %s', uri, name)
            # Either value may be None when the anchor lacks an href or a
            # direct text node; such anchors cannot be matched or followed.
            if not uri or not name:
                continue
            if self.list_title_keyword not in name:
                continue
            yield Request(
                method='GET',
                # The href may be relative or protocol-relative; Request
                # rejects URLs without a scheme, so resolve it first.
                url=response.urljoin(uri),
                headers=self._build_headers(),
                cookies=self.cookie,
                meta={'source': name, 'type': PageType.SECTION_DETAIL},
            )

    def parse_section_detail(self, response):
        """Yield one price record per data row of the detail page's market table.

        The date is taken from the page title: everything before the first
        '日', with '年' and '月' replaced by '-'. Rows with fewer than seven
        cells, or whose price / fluctuation cells are not integers, are
        skipped (the latter with a warning) so that one malformed row does
        not discard the rest of the table.
        """
        title = response.xpath('//*[@id="content-title"]/text()').get()
        if title is None:
            self.logger.warning('no title found on %s; page skipped', response.url)
            return
        date = title.split('日')[0].replace('年', '-').replace('月', '-').strip()

        # position()>2 skips the first two rows of the table
        # (presumably header rows -- confirm against the page markup).
        for row in response.xpath('//*[@id="marketTable"]/tr[position()>2]'):
            # A cell without a direct text node yields None; treat it as empty.
            cells = [(cell.xpath('text()').get() or '').strip() for cell in row.xpath('td')]
            self.logger.debug('table row: %s', cells)
            # Only the first six cells are used, but shorter-than-seven rows
            # are rejected, as in the original implementation.
            if len(cells) < 7:
                continue
            name, spec, material, source, price, fluctuating, *_ = cells
            try:
                price_value = int(price)
                # '-' marks an unchanged price.
                change_value = 0 if fluctuating == '-' else int(fluctuating)
            except ValueError:
                self.logger.warning('skipping row with non-numeric price data: %r', cells)
                continue
            yield {
                'name': name,
                'spec': spec,
                'material': material,
                'source': source,
                'price': price_value,
                'fluctuating': change_value,
                'date': date,
            }
 | |
| 
 | |
| 
 | |
if __name__ == '__main__':
    # Script entry point: crawl with SteelSectionSpider, then upsert every
    # scraped record through the SteelSection model.
    import json

    from spiders import run_spider, MysteelCookieTools
    from commons.models.steel_section import SteelSection
    from core.factory import ClientApp

    # Load the cookies.
    cookie = MysteelCookieTools.get_cookies()
    # Crawl. The spider reads its cookies and user agent from class attributes.
    SteelSectionSpider.cookie = cookie
    SteelSectionSpider.user_agent = MysteelCookieTools.user_agent
    file_path = run_spider(SteelSectionSpider)
    # Store the results. The file at file_path is parsed as JSON; the context
    # manager guarantees the handle is closed even if parsing fails.
    with open(file_path, 'r', encoding='utf-8') as result_file:
        data = json.load(result_file)
    with ClientApp().app_context():
        for item in data:
            print(item)
            SteelSection(**item).upsert()
 | 
