init
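Adds a Scrapy spider that crawls 材料信息价 (material price) notices from the Guangdong Provincial Transport Department cost-station site (zjz.gdcd.gov.cn) and, when run as a script, persists the scraped items through the project's DataGuangdong model.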
web/spiders/data_guangdong.py (new file, 78 lines)

@@ -0,0 +1,78 @@
import scrapy
from scrapy import Request

# Shared request headers: a desktop Chrome User-Agent so the site serves the
# regular browser-facing pages.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
}


class DataGuangdongSpider(scrapy.Spider):
    name = "data_guangdong"

    # (source_label, url) pairs consumed by the overridden start_requests below;
    # this deliberately repurposes Scrapy's start_urls attribute.
    start_urls = [
        ('材料信息价', "http://zjz.gdcd.gov.cn/zjzgdcd/zjxx_clxxj/list.shtml"),
        ('材料信息价', "http://zjz.gdcd.gov.cn/zjzgdcd/zjxx_clxxj/list_2.shtml"),
        ('材料信息价', "http://zjz.gdcd.gov.cn/zjzgdcd/zjxx_clxxj/list_3.shtml"),
        ('材料信息价', "http://zjz.gdcd.gov.cn/zjzgdcd/zjxx_clxxj/list_4.shtml"),
        ('材料信息价', "http://zjz.gdcd.gov.cn/zjzgdcd/zjxx_clxxj/list_5.shtml"),
    ]
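    # Note: only index pages list.shtml through list_5.shtml are covered; the
    # spider does not follow pagination links, so if the index grows past five
    # pages this list has to be extended by hand.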

    def start_requests(self):
        # Fetch each index page; the source label and a page-type flag travel
        # along in the request meta.
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers=HEADERS,
                meta={'source': source, 'type': 'home'},
            )

    def parse(self, response, **kwargs):
        # Route responses by the 'type' flag set when the request was created.
        if response.meta['type'] == 'home':
            yield from self.parse_home(response)
        elif response.meta['type'] == 'list':
            yield from self.parse_list(response)

    def parse_home(self, response):
        # The index page lists announcements; keep only the material price
        # tables ('广东省交通建设工程主要外购材料信息价表') and request each detail page.
        for item in response.xpath('/html/body/div/div[3]/div[2]/div[2]/div/ul/li/a'):
            uri = item.xpath('@href').get()
            name = item.xpath('text()').get()
            if '广东省交通建设工程主要外购材料信息价表' not in name:
                continue
            yield Request(
                method='GET',
                url=f'http://zjz.gdcd.gov.cn{uri}',  # index hrefs lack the host, so prefix it
                headers=HEADERS,
                meta={'source': name, 'type': 'list'},
            )

    def parse_list(self, response):
        # removeprefix (Python 3.9+) drops the literal '发布时间:' label; the
        # original str.strip call treated its argument as a character set and
        # could also eat matching characters from the end of the date.
        date = response.xpath('/html/body/div[1]/div[4]/div/div[1]/div[2]/span[2]/text()').get().removeprefix('发布时间:')
        source = response.xpath('/html/body/div[1]/div[4]/div/div[1]/div[2]/span[1]/b/text()').get()
        for item in response.xpath('//*[@id="zoomcon"]/p/a'):
            uri = item.xpath('@href').get()
            name = item.xpath('text()').get()
            # Attachment links are relative to the detail page's directory.
            url_prefix = '/'.join(response.url.split('/')[:-1])
            self.logger.debug('%s %s', uri, name)  # was a bare print
            yield {
                'url': f'{url_prefix}/{uri}',
                'name': name,
                # 'source': response.meta['source'],
                'source': source,
                'date': date,
            }


if __name__ == '__main__':
    import json

    from spiders import run_spider
    from commons.models.data_guangdong import DataGuangdong
    from core.factory import ClientApp

    # Crawl: run the spider and collect its items into a JSON file.
    file_path = run_spider(DataGuangdongSpider)
    # Persist: load the scraped items and upsert each one into the database.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    with ClientApp().app_context():
        for item in data:
            print(item)
            DataGuangdong(**item).upsert()
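
For a quick local run without the project-specific helpers (run_spider, ClientApp, DataGuangdong), a minimal sketch using Scrapy's stock CrawlerProcess with a JSON feed export could look like the following; the output file name data_guangdong.json is an assumption, not part of the project:

# Sketch only: standalone run via Scrapy's CrawlerProcess.
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    # FEEDS is Scrapy's built-in feed-export setting; the file name is arbitrary.
    'FEEDS': {'data_guangdong.json': {'format': 'json', 'encoding': 'utf8'}},
})
process.crawl(DataGuangdongSpider)
process.start()  # blocks until the crawl finishes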