init
web/spiders/oil.py | 83 (new file)
@@ -0,0 +1,83 @@
import json

import scrapy
from scrapy import Request


class OilSpider(scrapy.Spider):
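    """Spider for refined-oil price adjustment ("成品油价格调整") notices on fgw.fujian.gov.cn.

    Each start_urls entry is a (source label, search-API URL) pair rather than a bare URL,
    so start_requests is overridden to unpack it.
    """
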
    name = "oil"
    start_urls = [
        ('成品油价格调整', "https://fgw.fujian.gov.cn/was5/web/search?channelid=217025&templet=advsch.jsp&sortfield=-docreltime&classsql=%25%E6%88%90%E5%93%81%E6%B2%B9%E4%BB%B7%E6%A0%BC%E8%B0%83%E6%95%B4%25*siteid%3D31*siteid%3D31&prepage=100&page=1"),
    ]

    def start_requests(self):
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
                },
                meta={'source': source, 'type': 'list'}
            )

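    # parse() routes responses by the 'type' flag attached in request meta:
    # the search listing goes to parse_list, individual notice pages to parse_detail.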
    def parse(self, response, **kwargs):
        if response.meta['type'] == 'list':
            yield from self.parse_list(response)
        elif response.meta['type'] == 'detail':
            yield from self.parse_detail(response)

    def parse_list(self, response):
        # Strip newlines before parsing the JSON returned by the search endpoint.
        ret = json.loads(response.text.replace('\n', ''))
        if ret.get('count') and ret.get('docs'):
            for item in ret['docs']:
                if not item.get('title2'):
                    continue
                self.logger.info(f"{item['title2']} {item['pubtime']}")
                yield Request(
                    method='GET',
                    url=item['url'],
                    meta={'source': f"{item['title2']} {item['pubtime']}", 'type': 'detail', 'time': item['pubtime']}
                )

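    # The first table on a notice page comes in two layouts: product names are either
    # wrapped in <span> cells or given as plain <td> text, so both branches below are needed.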
    def parse_detail(self, response):
        for item in response.xpath('//table[1]/tbody/tr'):
            if item.xpath('td/span/text()').getall():
                # Layout 1: the product name sits in a <span>, prices in the plain <td> text.
                first_word = item.xpath('td/span/text()').get()
                # Skip header rows ('油品' = product, '元/吨' = yuan per tonne).
                if first_word.strip() in ('油品', '元/吨'):
                    continue
                name = first_word
                price, *_ = item.xpath('td/text()').getall()
                yield {
                    'name': name,
                    'price': int(price),
                    'date': response.meta['time'].split(' ')[0],
                }
            elif item.xpath('td/text()').getall():
                # Layout 2: both the product name and the price are plain <td> text.
                first_word = item.xpath('td/text()').get()
                if first_word.strip() in ('油品', '元/吨') or first_word.startswith('\xa0') or first_word.startswith('\n'):
                    continue
                name, price, *_ = item.xpath('td/text()').getall()
                yield {
                    'name': name,
                    'price': price,
                    'date': response.meta['time'].split(' ')[0],
                }


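# Ad-hoc runner: run_spider is expected to return the path of the JSON feed produced by
# the crawl; each scraped item is then upserted through the project's Oil model.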
if __name__ == '__main__':
    from spiders import run_spider
    from commons.models.oil import Oil
    from core.factory import ClientApp

    # Crawl
    file_path = run_spider(OilSpider)
    # Insert into the database
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.loads(f.read())
    with ClientApp().app_context():
        for item in data:
            print(item)
            Oil(**item).upsert()