feat: 导入福州历史数据

2024-07-11 09:13:30 +08:00
parent 967bb57136
commit 635279eba0
8 changed files with 404 additions and 30 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -24,4 +24,7 @@ gunicorn

 scrapy
 Twisted==22.10.0
-selenium
+selenium
+
+openpyxl
+xls2xlsx
--- a/web/commons/models/asphalt_modifier.py
+++ b/web/commons/models/asphalt_modifier.py
@@ -3,10 +3,12 @@ import datetime
 from dateutil.relativedelta import relativedelta
 from sqlalchemy import Column, Integer, String, Numeric, Date, UniqueConstraint

+from commons.models.mixin.base import BaseModelMixin
+from commons.models.model import Model
 from core.extensions import db


-class AsphaltModifier(db.Model):
+class AsphaltModifier(db.Model, Model, BaseModelMixin):
    __tablename__ = 'ASPHALT_MODIFIER'
    id = Column('ID', Integer, primary_key=True)
    name = Column('NAME', String(128), default='', comment='名称')
@@ -19,6 +21,14 @@ class AsphaltModifier(db.Model):
        {'comment': '沥青改性剂'},
    )

+    def find_by_key(self):
+        query = AsphaltModifier.query
+        query = query.filter(AsphaltModifier.name == self.name)
+        query = query.filter(AsphaltModifier.spec == self.spec)
+        query = query.filter(AsphaltModifier.date == self.date)
+        result = query.one_or_none()
+        return result
+
    @classmethod
    def get_query(cls, year=None, month=None, name=None, spec=None, name_in=None):
        query = cls.query
--- a/web/commons/models/fujian_survey.py
+++ b/web/commons/models/fujian_survey.py
@@ -3,10 +3,12 @@ import datetime
 from dateutil.relativedelta import relativedelta
 from sqlalchemy import Column, Integer, String, Date, UniqueConstraint, Numeric

+from commons.models.mixin.base import BaseModelMixin
+from commons.models.model import Model
 from core.extensions import db


-class FujianSurvey(db.Model):
+class FujianSurvey(db.Model, Model, BaseModelMixin):
    __tablename__ = 'FUJIAN_SURVEY'
    id = Column('ID', Integer, primary_key=True)
    name = Column('NAME', String(128), default='', comment='名称')
@@ -20,10 +22,19 @@ class FujianSurvey(db.Model):
    region = Column('REGION', String(128), comment='地区')

    __table_args__ = (
-        UniqueConstraint(name, spec, date, name='Idx_key'),
+        UniqueConstraint(name, spec, date, region, name='Idx_key'),
        {'comment': '福建省交通工程材料调查表'},
    )

+    def find_by_key(self):
+        query = FujianSurvey.query
+        query = query.filter(FujianSurvey.name == self.name)
+        query = query.filter(FujianSurvey.spec == self.spec)
+        query = query.filter(FujianSurvey.region == self.region)
+        query = query.filter(FujianSurvey.date == self.date)
+        result = query.one_or_none()
+        return result
+
    @classmethod
    def get_query(cls, year=None, month=None, name=None, spec=None, name_in=None, region='福州'):
        query = cls.query
--- a/web/commons/models/fuzhou_highway_bureau.py
+++ b/web/commons/models/fuzhou_highway_bureau.py
@@ -3,10 +3,12 @@ import datetime
 from dateutil.relativedelta import relativedelta
 from sqlalchemy import Column, Integer, String, Date, UniqueConstraint, Numeric

+from commons.models.mixin.base import BaseModelMixin
+from commons.models.model import Model
 from core.extensions import db


-class FuzhouHighwayBureau(db.Model):
+class FuzhouHighwayBureau(db.Model, Model, BaseModelMixin):
    __tablename__ = 'FUZHOU_HIGHWAY_BUREAU'
    id = Column('ID', Integer, primary_key=True)
    name = Column('NAME', String(128), default='', comment='名称')
@@ -23,6 +25,15 @@ class FuzhouHighwayBureau(db.Model):
        {'comment': '福州公路局'},
    )

+    def find_by_key(self):
+        query = FuzhouHighwayBureau.query
+        query = query.filter(FuzhouHighwayBureau.name == self.name)
+        query = query.filter(FuzhouHighwayBureau.spec == self.spec)
+        query = query.filter(FuzhouHighwayBureau.region == self.region)
+        query = query.filter(FuzhouHighwayBureau.date == self.date)
+        result = query.one_or_none()
+        return result
+
    @classmethod
    def get_query(cls, year, month, name, spec=None, region='福州'):
        start_date = datetime.date(year, month, 1)
--- a/web/commons/models/fuzhou_transportation_bureau.py
+++ b/web/commons/models/fuzhou_transportation_bureau.py
@@ -3,10 +3,12 @@ import datetime
 from dateutil.relativedelta import relativedelta
 from sqlalchemy import Column, Integer, String, Date, UniqueConstraint, Numeric

+from commons.models.mixin.base import BaseModelMixin
+from commons.models.model import Model
 from core.extensions import db


-class FuzhouTransportationBureau(db.Model):
+class FuzhouTransportationBureau(db.Model, Model, BaseModelMixin):
    __tablename__ = 'FUZHOU_TRANSPORTATION_BUREAU'
    id = Column('ID', Integer, primary_key=True)
    name = Column('NAME', String(128), default='', comment='名称')
@@ -23,6 +25,15 @@ class FuzhouTransportationBureau(db.Model):
        {'comment': '福州交通局'},
    )

+    def find_by_key(self):
+        query = FuzhouTransportationBureau.query
+        query = query.filter(FuzhouTransportationBureau.name == self.name)
+        query = query.filter(FuzhouTransportationBureau.spec == self.spec)
+        query = query.filter(FuzhouTransportationBureau.region == self.region)
+        query = query.filter(FuzhouTransportationBureau.date == self.date)
+        result = query.one_or_none()
+        return result
+
    @classmethod
    def get_query(cls, year, month, name, region='福州'):
        start_date = datetime.date(year, month, 1)
--- a/web/commons/models/local_material.py
+++ b/web/commons/models/local_material.py
@@ -1,9 +1,11 @@
 from sqlalchemy import Column, Integer, String, Date, UniqueConstraint, Numeric, Text

+from commons.models.mixin.base import BaseModelMixin
+from commons.models.model import Model
 from core.extensions import db


-class LocalMaterial(db.Model):
+class LocalMaterial(db.Model, Model, BaseModelMixin):
    __tablename__ = 'LOCAL_MATERIAL'
    id = Column('ID', Integer, primary_key=True)
    name = Column('NAME', String(128), default='', comment='名称')
@@ -22,3 +24,15 @@ class LocalMaterial(db.Model):
        UniqueConstraint(name, spec, city, county, date, name='Idx_key'),
        {'comment': '地材'},
    )
+
+    def find_by_key(self):
+        cls = self.__class__
+        query = cls.query
+        query = query.filter(cls.date == self.date)
+        query = query.filter(cls.spec == self.spec)
+        query = query.filter(cls.name == self.name)
+        query = query.filter(cls.city == self.city)
+        query = query.filter(cls.county == self.county)
+        result = query.one_or_none()
+        return result
+
--- a/web/commons/models/sanming_steel.py
+++ b/web/commons/models/sanming_steel.py
@@ -3,10 +3,12 @@ import datetime
 from dateutil.relativedelta import relativedelta
 from sqlalchemy import Column, Integer, String, Numeric, Date, UniqueConstraint

+from commons.models.mixin.base import BaseModelMixin
+from commons.models.model import Model
 from core.extensions import db


-class SanmingSteel(db.Model):
+class SanmingSteel(db.Model, Model, BaseModelMixin):
    __tablename__ = 'SANMING_STEEL'
    id = Column('ID', Integer, primary_key=True)
    name = Column('NAME', String(128), default='', comment='名称')
@@ -17,10 +19,18 @@ class SanmingSteel(db.Model):
    date = Column('DATE', Date, comment='日期')

    __table_args__ = (
-        UniqueConstraint(name, spec, material, date, name='Idx_key'),
+        UniqueConstraint(name, spec, date, name='Idx_key'),
        {'comment': '三明钢铁'},
    )

+    def find_by_key(self):
+        query = SanmingSteel.query
+        query = query.filter(SanmingSteel.name == self.name)
+        query = query.filter(SanmingSteel.spec == self.spec)
+        query = query.filter(SanmingSteel.date == self.date)
+        result = query.one_or_none()
+        return result
+
    @classmethod
    def get_query(cls, year, month, name, spec):
        start_date = datetime.date(year, month, 1)
--- a/web/scripts/unpack_history_data.py
+++ b/web/scripts/unpack_history_data.py
@@ -1,41 +1,320 @@
+import datetime
 import os
 import zipfile
 from pathlib import Path

+from dateutil.relativedelta import relativedelta
+from openpyxl import load_workbook
+from xls2xlsx import XLS2XLSX

-def main(zip_path=r'C:\Users\Administrator\Desktop\材料管理系统模版\造价站近两年价格数据.zip'):
+from commons.models.asphalt_modifier import AsphaltModifier
+from commons.models.data_network import DataNetwork
+from commons.models.fujian_survey import FujianSurvey
+from commons.models.fuzhou_highway_bureau import FuzhouHighwayBureau
+from commons.models.fuzhou_transportation_bureau import FuzhouTransportationBureau
+from commons.models.local_material import LocalMaterial
+from commons.models.sanming_steel import SanmingSteel
+
+
+def unpack(zip_path=r'C:\Users\Administrator\Desktop\材料管理系统模版\造价站近两年价格数据.zip'):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for file_info in zip_ref.infolist():
            filename = file_info.filename.encode('cp437').decode('gbk')
            is_excel = filename.endswith('.xlsx') or filename.endswith('.xls')
            is_fuzhou = '福州' in filename
            # 地材
-            # if is_excel and is_fuzhou and '地材汇总/' in filename and '福州市交通工程地方材料价格信息汇总表(数据处理)' not in filename\
-            #         and '混凝土价格' not in filename:
-            #     year, month = get_month(filename)
-            #     print(year, month, filename)
-            # 调查表
-            if is_excel and is_fuzhou and '11地市调查表/' in filename and '福建省' not in filename and '通闽公司' not in filename and '-' not in filename:
+            if is_excel and is_fuzhou and '地材汇总/' in filename and '福州市交通工程地方材料价格信息汇总表(数据处理)' not in filename\
+                    and '混凝土价格' not in filename:
                year, month = get_month(filename)
                print(year, month, filename)
-                target_file_path = f'./{year}-{month}-调查表-福州{Path(filename).suffix}'
-                with open(target_file_path, 'wb') as f:
-                    f.write(zip_ref.read(file_info))
+                yield zip_ref, file_info, f'./地材-福州-{year}-{int(month):02d}{Path(filename).suffix}'
+            # 调查表
+            elif is_excel and is_fuzhou and '11地市调查表/' in filename and '福建省' not in filename and '通闽公司' not in filename and '-' not in filename:
+                year, month = get_month(filename)
+                print(year, month, filename)
+                yield zip_ref, file_info, f'./调查表-福州-{year}-{int(month):02d}{Path(filename).suffix}'
            # 公路局
-            # if is_excel and is_fuzhou and '4.福州公路' in filename:
-            #     print(filename)
+            elif is_excel and is_fuzhou and '4.福州公路' in filename:
+                year, month = get_month(filename)
+                print(year, month, filename)
+                yield zip_ref, file_info, f'./公路局-福州-{year}-{int(month):02d}{Path(filename).suffix}'
            # 交通局
-            # if is_excel and is_fuzhou and '1.福州交通局' in filename:
-            #     print(filename)
+            elif is_excel and is_fuzhou and '1.福州交通局' in filename:
+                year, month = get_month(filename)
+                print(year, month, filename)
+                yield zip_ref, file_info, f'./交通局-福州-{year}-{int(month):02d}{Path(filename).suffix}'
            # 网络价格
-            # if is_excel and is_fuzhou and '网络价格（' in filename and '通闽公司' not in filename and '5.福州' not in filename:
-            #     print(filename)
+            elif is_excel and is_fuzhou and '网络价格（' in filename and '通闽公司' not in filename and '5.福州' not in filename:
+                year, month = get_month(filename)
+                print(year, month, filename)
+                yield zip_ref, file_info, f'./网络价格-福州-{year}-{int(month):02d}{Path(filename).suffix}'
            # 改性剂
-            # if is_excel and '改性剂和沙钢' in filename :
-            #     print(filename)
+            elif is_excel and '改性剂和沙钢' in filename :
+                year, month = get_month(filename)
+                print(year, month, filename)
+                yield zip_ref, file_info, f'./改性剂-福州-{year}-{int(month):02d}{Path(filename).suffix}'
            # 三明钢铁
-            # if is_excel and '2.三明钢铁' in filename:
-            #     print(filename)
+            elif is_excel and '2.三明钢铁' in filename:
+                year, month = get_month(filename)
+                print(year, month, filename)
+                yield zip_ref, file_info, f'./三明钢铁-福州-{year}-{int(month):02d}{Path(filename).suffix}'
+
+
+def main():
+    for zip_ref, file_info, target_file_path in unpack():
+        with open(target_file_path, 'wb') as f:
+            f.write(zip_ref.read(file_info))
+        if target_file_path.endswith('.xls'):
+            x2x = XLS2XLSX(target_file_path)
+            x2x.to_xlsx(target_file_path.replace('.xls', '.xlsx'))
+
+
+class Cleaner:
+    city = '福州市'
+
+    @classmethod
+    def clean_gxj(cls, table=r'E:\Project\material_api\web\scripts\改性剂-福州-2022-12.xlsx'):
+        """
+        改性剂
+        """
+        wb = load_workbook(file_path)
+        ws = wb.active
+        _, _, year, month, *_ = file.split('.')[0].split('-')
+        name = None
+        for row in ws.iter_rows(values_only=True):
+            for k, cell in enumerate(row):
+                if not name and cell == '日期':
+                    name = row[k+1]
+                if isinstance(cell, datetime.datetime):
+                    start = datetime.datetime(int(year), int(month), 1)
+                    end = start + relativedelta(months=1)
+                    if start <= cell < end:
+                        print(year, month, name, cell, row[k+1])
+                        yield {
+                            'name': name,
+                            'spec': '',
+                            'price': row[k+1],
+                            'date': cell,
+                        }
+
+    @classmethod
+    def clean_ss(cls, table=r'E:\Project\material_api\web\scripts\三明钢铁-福州-2022-12.xlsx'):
+        """
+        三明钢铁
+        """
+        wb = load_workbook(file_path)
+        ws = wb.active
+        _, _, year, month, *_ = file.split('.')[0].split('-')
+        result = []
+        for i, row in enumerate(ws.iter_rows(values_only=True)):
+            if isinstance(row[0], str) and ('材料编号' in row[0] or '三明钢铁' in row[0]):
+                continue
+            material, name, spec, _, _, price, *_ = row
+            if not name:
+                name = result[-1][3]
+            # if not spec:
+            #     spec = result[-1][4]
+            item = (year, month, material, name, spec, price)
+            result.append(item)
+            if not item[5]:
+                continue
+            print(item)
+            yield {
+                'name': name.replace(' ', '') if name else '',
+                'spec': spec.replace(' ', '') if spec else '',
+                'material': material,
+                'price': price,
+                'fluctuating': None,
+                'date': datetime.datetime(int(year), int(month), 1),
+            }
+
+    @classmethod
+    def clean_jtj(cls, table=r'E:\Project\material_api\web\scripts\交通局-福州-2022-12.xlsx'):
+        """
+        交通局
+        """
+        wb = load_workbook(file_path)
+        ws = wb.active
+        _, _, year, month, *_ = file.split('.')[0].split('-')
+        result = []
+        for i, row in enumerate(ws.iter_rows(values_only=True)):
+            if isinstance(row[0], str) and ('福建' in row[0] or '、' in row[0] or '材料' in row[0] or '电话' in row[0]):
+                continue
+            if not row[0] and not row[1] and not row[2]:
+                continue
+            material, name, spec, unit, brand, price, *_ = row
+            # if not name:
+            #     name = result[-1][3]
+            # if not spec:
+            #     spec = result[-1][4]
+            item = (year, month, material, name, spec, unit, brand, price)
+            result.append(item)
+            if not item[7] or item[7] == '-':
+                continue
+            print(item)
+            yield {
+                'name': name.replace(' ', '') if name else '',
+                'spec': spec.replace(' ', '') if spec else '',
+                'price': price,
+                'date': datetime.datetime(int(year), int(month), 1),
+                'material_id': material,
+                'unit': unit,
+                'brand': brand,
+                'region': cls.city.replace('市', ''),
+            }
+
+    @classmethod
+    def clean_glj(cls, table=r'E:\Project\material_api\web\scripts\公路局-福州-2022-12.xlsx'):
+        """
+        公路局
+        """
+        wb = load_workbook(file_path)
+        ws = wb.active
+        _, _, year, month, *_ = file.split('.')[0].split('-')
+        result = []
+        for i, row in enumerate(ws.iter_rows(values_only=True)):
+            if isinstance(row[0], str) and ('福建' in row[0] or '、' in row[0] or '材料' in row[0] or '电话' in row[0]
+                                            or '附件' in row[0] or '填报' in row[0]):
+                continue
+            if not row[0] and not row[1] and not row[2]:
+                continue
+            material, name, spec, unit, brand, price, source, remark, *_ = row
+            if not name:
+                name = result[-1][3]
+            # if not spec:
+            #     spec = result[-1][4]
+            item = (year, month, material, name, spec, unit, brand, price)
+            result.append(item)
+            if not item[7] or item[7] == '-':
+                continue
+            print(item)
+            yield {
+                'name': name.replace(' ', '') if name else '',
+                'spec': spec.replace(' ', '') if spec else '',
+                'price': price,
+                'date': datetime.datetime(int(year), int(month), 1),
+                'material_id': material,
+                'unit': unit,
+                'brand': brand,
+                'region': cls.city.replace('市', ''),
+            }
+
+    @classmethod
+    def clean_dcb(cls, table=r'E:\Project\material_api\web\scripts\调查表-福州-2022-12.xlsx'):
+        """
+        调查表
+        """
+        wb = load_workbook(file_path)
+        ws = wb.active
+        _, _, year, month, *_ = file.split('.')[0].split('-')
+        result = []
+        for i, row in enumerate(ws.iter_rows(values_only=True)):
+            print(row)
+            if isinstance(row[0], str) and ('福建' in row[0] or '、' in row[0] or '材料' in row[0] or '电话' in row[0]
+                                            or '附件' in row[0] or '填报' in row[0]):
+                continue
+            if not row[0] and not row[1] and not row[2]:
+                continue
+            material, name, spec, unit, brand, price, price_without_tax, tax, source, remark, *_ = row
+            # if not name:
+            #     name = result[-1][3]
+            # # if not spec:
+            # #     spec = result[-1][4]
+            item = (year, month, material, name, spec, unit, brand, price)
+            result.append(item)
+            if not price or price == '-':
+                continue
+            print(item)
+            yield {
+                'name': name.replace(' ', '') if name else '',
+                'spec': spec.replace(' ', '') if spec else '',
+                'price': price,
+                'date': datetime.datetime(int(year), int(month), 1),
+                'material_id': material,
+                'unit': unit,
+                'brand': brand,
+                'tax': tax,
+                'region': cls.city.replace('市', ''),
+            }
+
+    @classmethod
+    def clean_network(cls, table=r'E:\Project\material_api\web\scripts\网络价格-福州-2022-12.xlsx'):
+        """
+        网络价格
+        """
+        wb = load_workbook(file_path)
+        ws = wb.active
+        _, _, year, month, *_ = file.split('.')[0].split('-')
+        result = []
+        for i, row in enumerate(ws.iter_rows(values_only=True)):
+            if isinstance(row[0], str) and ('网络' in row[0] or '、' in row[0] or '材料' in row[0] or '电话' in row[0]
+                                            or '附件' in row[0] or '填报' in row[0]):
+                continue
+            if not row[0] and not row[1] and not row[2]:
+                continue
+            material, name, spec, unit, brand, price, _, remark, source, *_ = row
+            # # if not name:
+            # #     name = result[-1][3]
+            # # # if not spec:
+            # # #     spec = result[-1][4]
+            item = (year, month, material, name, spec, unit, brand, price, source, remark)
+            result.append(item)
+            if not item[7] or item[7] == '-':
+                continue
+            print(item)
+            yield {
+                'material_id': material,
+                'spec': spec.replace(' ', '') if spec else '',
+                'unit': unit,
+                'brand': brand,
+                'name': name.replace(' ', '') if name else '',
+                'price': price,
+                'source': source,
+                'remark': remark,
+                'date':  datetime.datetime(int(year), int(month), 1),
+                'region': cls.city.replace('市', ''),
+            }
+
+    @classmethod
+    def clean_local(cls, table=r'E:\Project\material_api\web\scripts\地材-福州-2022-12.xlsx'):
+        """
+        地材
+        """
+        wb = load_workbook(file_path)
+        ws = wb.active
+        _, _, year, month, *_ = file.split('.')[0].split('-')
+        result = []
+        for i, row in enumerate(ws.iter_rows(values_only=True)):
+            if isinstance(row[0], str) and ('汇总表' in row[0] or '、' in row[0] or '材料' in row[0] or '电话' in row[0]
+                                            or '附件' in row[0] or '填报' in row[0] or '单位' in row[0] or '序号' in row[0]):
+                continue
+            if not row[0] and not row[1] and not row[2]:
+                continue
+            _, county, name, spec, unit, price, price_without_tax, position, remark, *_ = row
+            if not county:
+                county = result[-1][2]
+            if not name:
+                name = result[-1][3]
+            item = (year, month, county, name, spec, unit, price, price_without_tax, position, remark)
+            result.append(item)
+            if not price or price in ('-', '∕', '/'):
+                continue
+            if not price_without_tax or price_without_tax in ('-', '∕', '/'):
+                price_without_tax = 0
+            print(item)
+            yield {
+                'name': name.replace(' ', '') if name else '',
+                'city': cls.city,
+                'county': county,
+                'material_id': '',
+                'spec': spec.replace(' ', '') if spec else '',
+                'unit': unit,
+                'price': price,
+                'price_without_tax': price_without_tax,
+                'date': datetime.datetime(int(year), int(month), 1),
+                'position': position,
+                'remark': remark,
+            }


 def get_month(filename):
@@ -50,4 +329,29 @@ def get_month(filename):


 if __name__ == '__main__':
-    main()
+    from core.factory import ClientApp
+    with ClientApp().app_context():
+        for root, dirs, files in os.walk(r'E:\Project\material_api\web\scripts'):
+            for file in files:
+                file_path = os.path.join(root, file)
+                if '改性剂' in file and '.xlsx' in file:
+                    for item in Cleaner.clean_gxj(file_path):
+                        AsphaltModifier(**item).upsert()
+                if '三明钢铁' in file and '.xlsx' in file:
+                    for item in Cleaner.clean_ss(file_path):
+                        SanmingSteel(**item).upsert()
+                if '交通局' in file and '.xlsx' in file:
+                    for item in Cleaner.clean_jtj(file_path):
+                        FuzhouTransportationBureau(**item).upsert()
+                if '公路局' in file and '.xlsx' in file:
+                    for item in Cleaner.clean_glj(file_path):
+                        FuzhouHighwayBureau(**item).upsert()
+                if '调查表' in file and '.xlsx' in file:
+                    for item in Cleaner.clean_dcb(file_path):
+                        FujianSurvey(**item).upsert()
+                if '网络价格' in file and '.xlsx' in file:
+                    for item in Cleaner.clean_network(file_path):
+                        DataNetwork(**item).upsert()
+                if '地材' in file and '.xlsx' in file:
+                    for item in Cleaner.clean_local(file_path):
+                        LocalMaterial(**item).upsert()