# material-api/web/spiders/cement.py

import scrapy
from scrapy import Request


class CementSpider(scrapy.Spider):
    """Spider that collects cement reference prices from baiinfo.com.

    The crawl has two stages, both routed through :meth:`parse` according to
    the ``type`` key stored in ``response.meta``:

    * ``home`` -- a news category listing page; article links whose title
      contains ``福建水泥市场参考价格`` are followed.
    * ``list_page`` -- an article page whose table rows are converted into
      price items.
    """

    name = "cement"
    # (source label, listing URL) pairs for pages 1-10 of the news category.
    # NOTE: these are tuples rather than plain URL strings, so
    # ``start_requests`` below has to stay overridden to unpack them.
    start_urls = [
        ('水泥', 'http://www.baiinfo.com/news/newscategory/4596/33/{}'.format(page))
        for page in range(1, 11)
    ]
    # Cookies and User-Agent sent with every request. Both default to None
    # and are meant to be assigned on the class before the crawl starts.
    cookie = None
    user_agent = None

    def _build_request(self, url, meta):
        """Build a GET request carrying the spider's User-Agent and cookies.

        No callback is set, so the response is handled by :meth:`parse`.
        """
        return Request(
            method='GET',
            url=url,
            headers={
                'User-Agent': self.user_agent,
            },
            cookies=self.cookie,
            meta=meta,
        )

    def start_requests(self):
        """Yield one ``home`` request per entry in ``start_urls``."""
        for source, url in self.start_urls:
            yield self._build_request(url, {'source': source, 'type': 'home'})

    def parse(self, response, **kwargs):
        """Dispatch the response to the handler named by ``meta['type']``.

        Responses with any other ``type`` value produce nothing.
        """
        page_type = response.meta['type']
        if page_type == 'home':
            yield from self.parse_home(response)
        elif page_type == 'list_page':
            yield from self.parse_list_page(response)

    def parse_home(self, response):
        """Follow the listing links whose title matches the wanted report.

        Yields a ``list_page`` request for every matching link; the link
        title is passed along as ``meta['source']``.
        """
        for link in response.xpath('//*[@id="__nuxt"]/div/div[5]/div/div[2]/div[1]/div[1]/div[2]/ul/li/a'):
            title = link.xpath('text()').get()
            href = link.xpath('@href').get()
            # ``.get()`` returns None when the node has no text / no href;
            # such links cannot be matched or followed, so skip them.
            if not title or not href:
                continue
            if '福建水泥市场参考价格' not in title:
                continue

            url = 'http://www.baiinfo.com{}'.format(href)
            self.logger.info('%s %s', title, url)
            yield self._build_request(url, {'source': title, 'type': 'list_page'})

    def parse_list_page(self, response):
        """Extract one price item per well-formed table row of an article.

        The publish date is read from the article header and rewritten from
        ``YYYY年MM月DD日`` to ``YYYY-MM-DD``. Rows that do not have the
        expected cell layout, or whose fluctuation is not an integer, are
        logged and skipped so that one irregular row does not discard the
        rest of the page. If the date cannot be found, nothing is yielded.
        """
        raw_date = response.xpath('//*[@id="__nuxt"]/div/div[5]/div/div[2]/div[1]/div[2]/p[1]/span[4]/text()').get()
        if not raw_date:
            self.logger.warning('No publish date found on %s; page skipped', response.url)
            return
        # Trim surrounding whitespace first, otherwise the trailing '日'
        # would not be at the end of the string and would survive.
        date = raw_date.strip().strip('日').replace('年', '-').replace('月', '-')

        # The first two rows are skipped by the XPath (presumably headers).
        for row in response.xpath('//tr[position()>2]'):
            price_cells = [
                text
                for text in row.xpath('td/span/span/span/span/span/text()').getall()
                if text
            ]
            detail_cells = [
                text
                for text in row.xpath('td/span/span/span/span/span/span/text()').getall()
                if text
            ]
            # A usable row has at least one price cell and exactly six
            # detail cells: spec, name, pack, source, <unused>, fluctuating.
            if not price_cells or len(detail_cells) != 6:
                self.logger.warning(
                    'Unexpected row layout on %s (price cells=%d, detail cells=%d); row skipped',
                    response.url, len(price_cells), len(detail_cells),
                )
                continue

            price = price_cells[0]
            spec, name, pack, source, _, fluctuating = detail_cells
            try:
                fluctuating = int(fluctuating)
            except ValueError:
                self.logger.warning(
                    'Non-integer fluctuation %r on %s; row skipped',
                    fluctuating, response.url,
                )
                continue

            yield {
                'name': name,
                'price': price,
                'spec': spec,
                'pack': pack,
                'date': date,
                'source': source,
                'fluctuating': fluctuating,
            }


if __name__ == '__main__':
    # Script entry point: configure the spider, run it, then store the
    # scraped items through the Cement model.
    import json

    from spiders import run_spider, BaiinfoCookieTools
    from commons.models.cement import Cement
    from core.factory import ClientApp

    # Load the cookies
    cookie = BaiinfoCookieTools.get_cookies()
    # Crawl, using the cookies and User-Agent supplied by BaiinfoCookieTools
    CementSpider.cookie = cookie
    CementSpider.user_agent = BaiinfoCookieTools.user_agent
    file_path = run_spider(CementSpider)
    # Persist: the returned path is read back as a UTF-8 JSON document.
    # The context manager closes the file handle instead of leaking it.
    with open(file_path, 'r', encoding='utf-8') as result_file:
        data = json.load(result_file)
    with ClientApp().app_context():
        for item in data:
            print(item)
            Cement(**item).upsert()