init
96 web/spiders/cement.py Normal file
@@ -0,0 +1,96 @@
import scrapy
from scrapy import Request


class CementSpider(scrapy.Spider):
    name = "cement"
    # (source label, listing-page URL) pairs; '水泥' is Chinese for "cement".
    start_urls = [
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/1"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/2"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/3"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/4"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/5"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/6"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/7"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/8"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/9"),
        ('水泥', "http://www.baiinfo.com/news/newscategory/4596/33/10"),
    ]
    # Both are injected at runtime (see the __main__ block below).
    cookie = None
    user_agent = None

    def start_requests(self):
        # Issue a GET for each listing page, tagging every request with its
        # source label and a 'home' type so parse() can route the response.
        # Scrapy accepts `cookies` as a dict or a list of dicts.
        for source, url in self.start_urls:
            yield Request(
                method='GET',
                url=url,
                headers={'User-Agent': self.user_agent},
                cookies=self.cookie,
                meta={'source': source, 'type': 'home'},
            )

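    # No explicit callback is passed above, so Scrapy delivers every response
    # to parse(), which dispatches on the meta['type'] flag.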
    def parse(self, response, **kwargs):
        if response.meta['type'] == 'home':
            yield from self.parse_home(response)
        elif response.meta['type'] == 'list_page':
            yield from self.parse_list_page(response)

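    # Note: the absolute //*[@id="__nuxt"]/... XPaths below are tied to the
    # site's current Nuxt-rendered markup and will need updating if it changes.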
    def parse_home(self, response):
        # Scan the article list on the listing page and follow every post
        # whose title contains '福建水泥市场参考价格'
        # ("Fujian cement market reference price").
        for item in response.xpath('//*[@id="__nuxt"]/div/div[5]/div/div[2]/div[1]/div[1]/div[2]/ul/li/a'):
            title = item.xpath('text()').get()
            if title and '福建水泥市场参考价格' in title:
                url = 'http://www.baiinfo.com{}'.format(item.xpath('@href').get())
                self.logger.info('%s %s', title, url)
                yield Request(
                    method='GET',
                    url=url,
                    headers={'User-Agent': self.user_agent},
                    cookies=self.cookie,
                    meta={'source': title, 'type': 'list_page'},
                )

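    # Detail links repeated across the ten listing pages are dropped by
    # Scrapy's default duplicate-request filter, so each article is fetched
    # at most once.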
    def parse_list_page(self, response):
        # The publish date is rendered like '2023年1月5日' (year/month/day);
        # normalize it to '2023-1-5'.
        date = response.xpath('//*[@id="__nuxt"]/div/div[5]/div/div[2]/div[1]/div[2]/p[1]/span[4]/text()').get()
        date = date.strip('日').replace('年', '-').replace('月', '-')

        # Skip the first two (header) rows; each remaining row is one record.
        for item in response.xpath('//tr[position()>2]'):
            # The price text sits one <span> level shallower than the other
            # cells, so the two nesting depths are read separately.
            block_1 = [cell.get() for cell in item.xpath('td/span/span/span/span/span/text()') if cell.get()]
            price, *_ = block_1

            block_2 = [cell.get() for cell in item.xpath('td/span/span/span/span/span/span/text()') if cell.get()]
            spec, name, pack, source, _, fluctuating = block_2

            yield {
                'name': name,
                'price': price,
                'spec': spec,
                'pack': pack,
                'date': date,
                'source': source,
                # Assumes the change cell holds a bare integer.
                'fluctuating': int(fluctuating),
            }

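# The keys yielded by parse_list_page must match the Cement model's columns,
# since each item is upserted below via Cement(**item).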
if __name__ == '__main__':
    import json

    from spiders import run_spider, BaiinfoCookieTools
    from commons.models.cement import Cement
    from core.factory import ClientApp

    # Load the cookie.
    cookie = BaiinfoCookieTools.get_cookies()
    # Crawl: inject the cookie and User-Agent, then run the spider;
    # run_spider returns the path of the JSON file the items were written to.
    CementSpider.cookie = cookie
    CementSpider.user_agent = BaiinfoCookieTools.user_agent
    file_path = run_spider(CementSpider)
    # Persist to the database.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    with ClientApp().app_context():
        for item in data:
            print(item)
            Cement(**item).upsert()
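# For ad-hoc runs without the project's run_spider harness, a minimal
# untested sketch using Scrapy's own runner (the 'cement.json' feed path is
# illustrative) would be:
#
#     from scrapy.crawler import CrawlerProcess
#     process = CrawlerProcess(settings={'FEEDS': {'cement.json': {'format': 'json'}}})
#     process.crawl(CementSpider)
#     process.start()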