85 lines
3.4 KiB
Python
85 lines
3.4 KiB
Python
import scrapy
|
|
from lxml import html, etree
|
|
from scrapy import Request
|
|
|
|
|
|
class DataZhejiangSpider(scrapy.Spider):
    """Spider for material-price bulletins on the Zhejiang Provincial DOT site.

    Flow:
      1. ``start_requests`` fetches the paginated column listing pages.
      2. ``parse_home`` extracts record links embedded in a CDATA <script>
         blob and follows only the "price information special issue" pages.
      3. ``parse_list`` yields one item per attachment link on a detail page.
    """

    name = "data_zhejiang"

    # Shared request headers: a desktop Chrome UA avoids trivial bot blocking.
    # Hoisted to a class constant so both request sites stay in sync.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
    }

    # (source label, url) pairs. Page 1 has no query string; pages 2-5 carry
    # the column uid and page number. Generated instead of hand-duplicated.
    start_urls = [
        ('材料价格', "http://jtyst.zj.gov.cn/col/col1228999576/index.html"),
    ] + [
        ('材料价格', f"http://jtyst.zj.gov.cn/col/col1228999576/index.html?uid=5509220&pageNum={page}")
        for page in range(2, 6)
    ]

    def start_requests(self):
        """Issue a GET for every configured listing page.

        The label travels in ``meta['source']``; ``meta['type']`` routes the
        response through :meth:`parse`.
        """
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers=self.HEADERS,
                meta={'source': source, 'type': 'home'},
            )

    def parse(self, response, **kwargs):
        """Dispatch the response to the handler named in ``meta['type']``."""
        page_type = response.meta['type']
        if page_type == 'home':
            yield from self.parse_home(response)
        elif page_type == 'list':
            yield from self.parse_list(response)

    def parse_home(self, response):
        """Parse a column listing page and follow price-bulletin links.

        The record list is delivered inside a <script> element as a CDATA
        blob of pseudo-XML (<record>/<recordset>/...); the wrapper tags are
        stripped so the remaining HTML fragment can be parsed with lxml.
        """
        rsp = response.xpath('//*[@id="5509220"]/script/text()').get()
        if rsp is None:
            # Guard: a layout change (or block page) would otherwise raise
            # AttributeError on the .replace() chain below.
            self.logger.warning('record script not found on %s', response.url)
            return
        for tag in ('<![CDATA[', ']]>', '</record>', '<record>',
                    '</recordset>', '<recordset>', '</datastore>',
                    '<datastore>', '</nextgroup>', '<nextgroup>'):
            rsp = rsp.replace(tag, '')
        # Named `doc`, not `html`, to avoid shadowing the lxml.html import.
        doc = etree.HTML(rsp)
        for item in doc.xpath('//li/a'):
            uri = item.xpath('@href')[0]
            name = item.xpath('text()')[0]
            self.logger.debug('home link: %s %s', uri, name)
            # Only the monthly "quality supervision & cost" price special
            # issues are of interest; skip everything else.
            if '《质监与造价》价格信息专辑' not in name:
                continue
            yield Request(
                method='GET',
                url=f'http://jtyst.zj.gov.cn{uri}',
                headers=self.HEADERS,
                meta={'source': name, 'type': 'list'},
            )

    def parse_list(self, response):
        """Parse a bulletin detail page; yield one dict per attachment link.

        Each item carries the absolute attachment URL, its link text, the
        issuing source, and the publication date normalized to YYYY-MM-DD.
        """
        # e.g. '2023年10月12日…' -> '2023-10-12'
        date = (
            response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/p/span[1]/text()')
            .get()
            .split('日')[0]
            .replace('年', '-')
            .replace('月', '-')
        )
        source = response.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/p/span[2]/text()[2]').get()
        for item in response.xpath('//*[@id="zoom"]/p/a'):
            uri = item.xpath('@href').get()
            name = item.xpath('text()').get()
            self.logger.debug('attachment: %s %s', uri, name)
            yield {
                'url': f'https://jtyst.zj.gov.cn{uri}',
                'name': name,
                'source': source,
                'date': date,
            }
|
|
|
|
|
|
if __name__ == '__main__':
    import json

    from spiders import run_spider
    from commons.models.data_zhejiang import DataZhejiang
    from core.factory import ClientApp

    # Crawl: run the spider; results are written to a JSON file.
    file_path = run_spider(DataZhejiangSpider)

    # Persist: load the scraped items and upsert each one inside an
    # application context. `with open(...)` guarantees the file handle is
    # closed (the original leaked it), and json.load reads the stream
    # directly instead of json.loads(f.read()).
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    with ClientApp().app_context():
        for item in data:
            print(item)
            DataZhejiang(**item).upsert()
|