init
web/spiders/data_fujian.py (new file, 91 lines)
@@ -0,0 +1,91 @@
import datetime
from dateutil.relativedelta import relativedelta

import scrapy
from scrapy import Request


# (CityID2 value, city name) pairs used to build the request URLs
CITY_ID = [
    ('93', '福州市'),    # Fuzhou
    ('94', '厦门市'),    # Xiamen
    ('95', '宁德市'),    # Ningde
    ('96', '莆田市'),    # Putian
    ('97', '泉州市'),    # Quanzhou
    ('98', '漳州市'),    # Zhangzhou
    ('99', '龙岩市'),    # Longyan
    ('100', '三明市'),   # Sanming
    ('101', '南平市'),   # Nanping
    ('102', '平潭综合实验区'),  # Pingtan Comprehensive Experimental Zone
]

# (qClassCode value, material category) pairs; the category also becomes the item's source label
CLASS_CODE = [
    ('01', '01黑色及有色金属'),        # ferrous and non-ferrous metals
    ('04', '04水泥、砖瓦灰砂'),        # cement, brick, tile, lime and sand
    ('05', '05木、竹材料及其制品'),    # timber and bamboo materials and their products
    ('13', '13涂料及防腐、防水材料'),  # coatings, anti-corrosion and waterproofing materials
    ('14', '14油品、化工原料'),        # oil products and chemical raw materials
]

URL = 'http://49.4.85.126/Information/Index?qClassCode={class_code}&qMatType=0&WayID=14&WayID2=4&CityID=7&CityID2={city_id}&Year={year}&Month={month}&Week=0&Day=0&qKeyWord='
MONTHS = 2  # number of most recent months to crawl
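
# Illustrative expansion (parameter values chosen for the example, not taken from the source):
# URL.format(class_code='01', city_id='93', year=2024, month=5) gives
#   http://49.4.85.126/Information/Index?qClassCode=01&qMatType=0&WayID=14&WayID2=4&CityID=7&CityID2=93&Year=2024&Month=5&Week=0&Day=0&qKeyWord=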


class DataFujianSpider(scrapy.Spider):
    name = "data_fujian"

    def start_requests(self):
        # one request per (city, material class, recent month) combination
        for city_id, city_name in CITY_ID:
            for class_code, source in CLASS_CODE:
                for month in range(1, 1 + MONTHS):
                    date = datetime.date.today() - relativedelta(months=month)
                    yield Request(
                        method='GET',
                        url=URL.format(year=date.year, month=date.month, class_code=class_code, city_id=city_id),
                        headers={
                            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
                        },
                        meta={'source': source, 'type': 'home', 'city': city_name, 'month': date.month, 'year': date.year}
                    )
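
    # Fan-out note: with the constants above, start_requests yields
    # 10 cities x 5 class codes x 2 months = 100 requests in total.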

    def parse(self, response, **kwargs):
        for item in response.xpath('//*[@id="searcList"]/div/div[2]/div[3]/div[3]/table/tr'):
            # plain-text cells of the row; guard against cells with no text node
            block_1 = [(i.xpath('text()').get() or '').strip() for i in item.xpath('td')]
            self.logger.debug(block_1)
            if len(block_1) < 6:  # skip header or malformed rows
                continue
            number, _, type_, unit, price, price_with_tax, *_ = block_1

            # the material name sits in a <span> inside its cell
            block_2 = [(i.xpath('text()').get() or '').strip() for i in item.xpath('td/span')]
            self.logger.debug(block_2)
            if not block_2:
                continue
            name, *_ = block_2

            yield {
                'number': number,
                'name': name,
                'spec': type_,
                'unit': unit,
                'price_without_tax': price,
                'price': price_with_tax,
                'category': response.meta['source'],
                'year': response.meta['year'],
                'month': response.meta['month'],
                'city': response.meta['city'],
                'date': datetime.date.today().strftime('%Y-%m-%d')
            }
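
    # Shape of one yielded item, sketched with placeholder values (not real data):
    #   {'number': '1', 'name': '螺纹钢', 'spec': 'HRB400', 'unit': '吨',
    #    'price_without_tax': '3500.00', 'price': '3955.00',
    #    'category': '01黑色及有色金属', 'year': 2024, 'month': 5,
    #    'city': '福州市', 'date': '2024-06-15'}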


if __name__ == '__main__':
    import json

    from spiders import run_spider
    from commons.models.data_fujian import DataFujian
    from core.factory import ClientApp

    # crawl
    file_path = run_spider(DataFujianSpider)
    # store in the database
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    with ClientApp().app_context():
        for item in data:
            print(item)
            DataFujian(**item).upsert()
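
# A minimal standalone alternative to the project-specific run_spider helper,
# using only Scrapy's public API (a sketch; the DB upsert step above is omitted):
#
#   from scrapy.crawler import CrawlerProcess
#
#   process = CrawlerProcess(settings={
#       'FEEDS': {'data_fujian.json': {'format': 'json', 'encoding': 'utf8'}},
#   })
#   process.crawl(DataFujianSpider)
#   process.start()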