feat: 导入福州历史数据

This commit is contained in:
han0
2024-07-11 09:13:30 +08:00
parent 967bb57136
commit 635279eba0
8 changed files with 404 additions and 30 deletions

View File

@@ -24,4 +24,7 @@ gunicorn
scrapy scrapy
Twisted==22.10.0 Twisted==22.10.0
selenium selenium
openpyxl
xls2xlsx

View File

@@ -3,10 +3,12 @@ import datetime
from dateutil.relativedelta import relativedelta from dateutil.relativedelta import relativedelta
from sqlalchemy import Column, Integer, String, Numeric, Date, UniqueConstraint from sqlalchemy import Column, Integer, String, Numeric, Date, UniqueConstraint
from commons.models.mixin.base import BaseModelMixin
from commons.models.model import Model
from core.extensions import db from core.extensions import db
class AsphaltModifier(db.Model): class AsphaltModifier(db.Model, Model, BaseModelMixin):
__tablename__ = 'ASPHALT_MODIFIER' __tablename__ = 'ASPHALT_MODIFIER'
id = Column('ID', Integer, primary_key=True) id = Column('ID', Integer, primary_key=True)
name = Column('NAME', String(128), default='', comment='名称') name = Column('NAME', String(128), default='', comment='名称')
@@ -19,6 +21,14 @@ class AsphaltModifier(db.Model):
{'comment': '沥青改性剂'}, {'comment': '沥青改性剂'},
) )
def find_by_key(self):
    """Look up the stored row that shares this instance's name, spec and date.

    Returns:
        The matching ``AsphaltModifier`` row, or ``None`` when nothing
        matches.  ``one_or_none()`` raises if more than one row matches.
    """
    model = AsphaltModifier
    return (
        model.query
        .filter(model.name == self.name)
        .filter(model.spec == self.spec)
        .filter(model.date == self.date)
        .one_or_none()
    )
@classmethod @classmethod
def get_query(cls, year=None, month=None, name=None, spec=None, name_in=None): def get_query(cls, year=None, month=None, name=None, spec=None, name_in=None):
query = cls.query query = cls.query

View File

@@ -3,10 +3,12 @@ import datetime
from dateutil.relativedelta import relativedelta from dateutil.relativedelta import relativedelta
from sqlalchemy import Column, Integer, String, Date, UniqueConstraint, Numeric from sqlalchemy import Column, Integer, String, Date, UniqueConstraint, Numeric
from commons.models.mixin.base import BaseModelMixin
from commons.models.model import Model
from core.extensions import db from core.extensions import db
class FujianSurvey(db.Model): class FujianSurvey(db.Model, Model, BaseModelMixin):
__tablename__ = 'FUJIAN_SURVEY' __tablename__ = 'FUJIAN_SURVEY'
id = Column('ID', Integer, primary_key=True) id = Column('ID', Integer, primary_key=True)
name = Column('NAME', String(128), default='', comment='名称') name = Column('NAME', String(128), default='', comment='名称')
@@ -20,10 +22,19 @@ class FujianSurvey(db.Model):
region = Column('REGION', String(128), comment='地区') region = Column('REGION', String(128), comment='地区')
__table_args__ = ( __table_args__ = (
UniqueConstraint(name, spec, date, name='Idx_key'), UniqueConstraint(name, spec, date, region, name='Idx_key'),
{'comment': '福建省交通工程材料调查表'}, {'comment': '福建省交通工程材料调查表'},
) )
def find_by_key(self):
    """Look up the stored row sharing this instance's name, spec, region and date.

    Returns:
        The matching ``FujianSurvey`` row, or ``None`` when nothing
        matches.  ``one_or_none()`` raises if more than one row matches.
    """
    model = FujianSurvey
    return (
        model.query
        .filter(model.name == self.name)
        .filter(model.spec == self.spec)
        .filter(model.region == self.region)
        .filter(model.date == self.date)
        .one_or_none()
    )
@classmethod @classmethod
def get_query(cls, year=None, month=None, name=None, spec=None, name_in=None, region='福州'): def get_query(cls, year=None, month=None, name=None, spec=None, name_in=None, region='福州'):
query = cls.query query = cls.query

View File

@@ -3,10 +3,12 @@ import datetime
from dateutil.relativedelta import relativedelta from dateutil.relativedelta import relativedelta
from sqlalchemy import Column, Integer, String, Date, UniqueConstraint, Numeric from sqlalchemy import Column, Integer, String, Date, UniqueConstraint, Numeric
from commons.models.mixin.base import BaseModelMixin
from commons.models.model import Model
from core.extensions import db from core.extensions import db
class FuzhouHighwayBureau(db.Model): class FuzhouHighwayBureau(db.Model, Model, BaseModelMixin):
__tablename__ = 'FUZHOU_HIGHWAY_BUREAU' __tablename__ = 'FUZHOU_HIGHWAY_BUREAU'
id = Column('ID', Integer, primary_key=True) id = Column('ID', Integer, primary_key=True)
name = Column('NAME', String(128), default='', comment='名称') name = Column('NAME', String(128), default='', comment='名称')
@@ -23,6 +25,15 @@ class FuzhouHighwayBureau(db.Model):
{'comment': '福州公路局'}, {'comment': '福州公路局'},
) )
def find_by_key(self):
    """Look up the stored row sharing this instance's name, spec, region and date.

    Returns:
        The matching ``FuzhouHighwayBureau`` row, or ``None`` when nothing
        matches.  ``one_or_none()`` raises if more than one row matches.
    """
    model = FuzhouHighwayBureau
    return (
        model.query
        .filter(model.name == self.name)
        .filter(model.spec == self.spec)
        .filter(model.region == self.region)
        .filter(model.date == self.date)
        .one_or_none()
    )
@classmethod @classmethod
def get_query(cls, year, month, name, spec=None, region='福州'): def get_query(cls, year, month, name, spec=None, region='福州'):
start_date = datetime.date(year, month, 1) start_date = datetime.date(year, month, 1)

View File

@@ -3,10 +3,12 @@ import datetime
from dateutil.relativedelta import relativedelta from dateutil.relativedelta import relativedelta
from sqlalchemy import Column, Integer, String, Date, UniqueConstraint, Numeric from sqlalchemy import Column, Integer, String, Date, UniqueConstraint, Numeric
from commons.models.mixin.base import BaseModelMixin
from commons.models.model import Model
from core.extensions import db from core.extensions import db
class FuzhouTransportationBureau(db.Model): class FuzhouTransportationBureau(db.Model, Model, BaseModelMixin):
__tablename__ = 'FUZHOU_TRANSPORTATION_BUREAU' __tablename__ = 'FUZHOU_TRANSPORTATION_BUREAU'
id = Column('ID', Integer, primary_key=True) id = Column('ID', Integer, primary_key=True)
name = Column('NAME', String(128), default='', comment='名称') name = Column('NAME', String(128), default='', comment='名称')
@@ -23,6 +25,15 @@ class FuzhouTransportationBureau(db.Model):
{'comment': '福州交通局'}, {'comment': '福州交通局'},
) )
def find_by_key(self):
    """Look up the stored row sharing this instance's name, spec, region and date.

    Returns:
        The matching ``FuzhouTransportationBureau`` row, or ``None`` when
        nothing matches.  ``one_or_none()`` raises if more than one row
        matches.
    """
    model = FuzhouTransportationBureau
    return (
        model.query
        .filter(model.name == self.name)
        .filter(model.spec == self.spec)
        .filter(model.region == self.region)
        .filter(model.date == self.date)
        .one_or_none()
    )
@classmethod @classmethod
def get_query(cls, year, month, name, region='福州'): def get_query(cls, year, month, name, region='福州'):
start_date = datetime.date(year, month, 1) start_date = datetime.date(year, month, 1)

View File

@@ -1,9 +1,11 @@
from sqlalchemy import Column, Integer, String, Date, UniqueConstraint, Numeric, Text from sqlalchemy import Column, Integer, String, Date, UniqueConstraint, Numeric, Text
from commons.models.mixin.base import BaseModelMixin
from commons.models.model import Model
from core.extensions import db from core.extensions import db
class LocalMaterial(db.Model): class LocalMaterial(db.Model, Model, BaseModelMixin):
__tablename__ = 'LOCAL_MATERIAL' __tablename__ = 'LOCAL_MATERIAL'
id = Column('ID', Integer, primary_key=True) id = Column('ID', Integer, primary_key=True)
name = Column('NAME', String(128), default='', comment='名称') name = Column('NAME', String(128), default='', comment='名称')
@@ -22,3 +24,15 @@ class LocalMaterial(db.Model):
UniqueConstraint(name, spec, city, county, date, name='Idx_key'), UniqueConstraint(name, spec, city, county, date, name='Idx_key'),
{'comment': '地材'}, {'comment': '地材'},
) )
def find_by_key(self):
    """Look up the stored row sharing this instance's date, spec, name, city and county.

    The query is built on the instance's own class rather than a
    hard-coded model name.

    Returns:
        The matching row, or ``None`` when nothing matches.
        ``one_or_none()`` raises if more than one row matches.
    """
    model = type(self)
    return (
        model.query
        .filter(model.date == self.date)
        .filter(model.spec == self.spec)
        .filter(model.name == self.name)
        .filter(model.city == self.city)
        .filter(model.county == self.county)
        .one_or_none()
    )

View File

@@ -3,10 +3,12 @@ import datetime
from dateutil.relativedelta import relativedelta from dateutil.relativedelta import relativedelta
from sqlalchemy import Column, Integer, String, Numeric, Date, UniqueConstraint from sqlalchemy import Column, Integer, String, Numeric, Date, UniqueConstraint
from commons.models.mixin.base import BaseModelMixin
from commons.models.model import Model
from core.extensions import db from core.extensions import db
class SanmingSteel(db.Model): class SanmingSteel(db.Model, Model, BaseModelMixin):
__tablename__ = 'SANMING_STEEL' __tablename__ = 'SANMING_STEEL'
id = Column('ID', Integer, primary_key=True) id = Column('ID', Integer, primary_key=True)
name = Column('NAME', String(128), default='', comment='名称') name = Column('NAME', String(128), default='', comment='名称')
@@ -17,10 +19,18 @@ class SanmingSteel(db.Model):
date = Column('DATE', Date, comment='日期') date = Column('DATE', Date, comment='日期')
__table_args__ = ( __table_args__ = (
UniqueConstraint(name, spec, material, date, name='Idx_key'), UniqueConstraint(name, spec, date, name='Idx_key'),
{'comment': '三明钢铁'}, {'comment': '三明钢铁'},
) )
def find_by_key(self):
    """Look up the stored row that shares this instance's name, spec and date.

    Returns:
        The matching ``SanmingSteel`` row, or ``None`` when nothing
        matches.  ``one_or_none()`` raises if more than one row matches.
    """
    model = SanmingSteel
    return (
        model.query
        .filter(model.name == self.name)
        .filter(model.spec == self.spec)
        .filter(model.date == self.date)
        .one_or_none()
    )
@classmethod @classmethod
def get_query(cls, year, month, name, spec): def get_query(cls, year, month, name, spec):
start_date = datetime.date(year, month, 1) start_date = datetime.date(year, month, 1)

View File

@@ -1,41 +1,320 @@
import datetime
import os import os
import zipfile import zipfile
from pathlib import Path from pathlib import Path
from dateutil.relativedelta import relativedelta
from openpyxl import load_workbook
from xls2xlsx import XLS2XLSX
def main(zip_path=r'C:\Users\Administrator\Desktop\材料管理系统模版\造价站近两年价格数据.zip'): from commons.models.asphalt_modifier import AsphaltModifier
from commons.models.data_network import DataNetwork
from commons.models.fujian_survey import FujianSurvey
from commons.models.fuzhou_highway_bureau import FuzhouHighwayBureau
from commons.models.fuzhou_transportation_bureau import FuzhouTransportationBureau
from commons.models.local_material import LocalMaterial
from commons.models.sanming_steel import SanmingSteel
def unpack(zip_path=r'C:\Users\Administrator\Desktop\材料管理系统模版\造价站近两年价格数据.zip'):
    """Yield the Excel members of the price archive that should be imported.

    Each member name is matched against a fixed, ordered list of categories;
    the first category that matches wins and members matching none are
    ignored.

    Args:
        zip_path: Path of the zip archive to scan.

    Yields:
        ``(zip_ref, file_info, target_file_path)`` tuples, where ``zip_ref``
        is the open ``ZipFile``, ``file_info`` the member's ``ZipInfo`` and
        ``target_file_path`` a relative path of the form
        ``./<category>-福州-<year>-<MM><suffix>``.  The archive is only open
        while the generator is being iterated, so members must be read
        before advancing to the next item.
    """
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        for file_info in zip_ref.infolist():
            if file_info.flag_bits & 0x800:
                # UTF-8 flag set: zipfile already decoded the name correctly,
                # and re-encoding Chinese text as cp437 would raise.
                filename = file_info.filename
            else:
                # Legacy archives store GBK bytes that zipfile decodes as
                # cp437; undo that and decode with the real encoding.
                filename = file_info.filename.encode('cp437').decode('gbk')

            if not filename.endswith(('.xlsx', '.xls')):
                continue
            is_fuzhou = '福州' in filename

            # Order matters: the first matching category is used.
            if (is_fuzhou and '地材汇总/' in filename
                    and '福州市交通工程地方材料价格信息汇总表(数据处理)' not in filename
                    and '混凝土价格' not in filename):
                label = '地材'  # local materials
            elif (is_fuzhou and '11地市调查表/' in filename and '福建省' not in filename
                    and '通闽公司' not in filename and '-' not in filename):
                label = '调查表'  # survey form
            elif is_fuzhou and '4.福州公路' in filename:
                label = '公路局'  # highway bureau
            elif is_fuzhou and '1.福州交通局' in filename:
                label = '交通局'  # transportation bureau
            elif (is_fuzhou and '网络价格(' in filename and '通闽公司' not in filename
                    and '5.福州' not in filename):
                label = '网络价格'  # network prices
            elif '改性剂和沙钢' in filename:
                label = '改性剂'  # asphalt modifier (not restricted to Fuzhou names)
            elif '2.三明钢铁' in filename:
                label = '三明钢铁'  # Sanming Steel (not restricted to Fuzhou names)
            else:
                continue

            year, month = get_month(filename)
            print(year, month, filename)
            yield zip_ref, file_info, f'./{label}-福州-{year}-{int(month):02d}{Path(filename).suffix}'
def main():
    """Extract every selected archive member to disk and convert ``.xls`` copies.

    Each member yielded by ``unpack()`` is written to its target path; when
    that path ends in ``.xls`` an ``.xlsx`` version is produced next to it
    with ``XLS2XLSX``.
    """
    for archive, member, destination in unpack():
        with open(destination, 'wb') as out:
            out.write(archive.read(member))
        if not destination.endswith('.xls'):
            continue
        converter = XLS2XLSX(destination)
        converter.to_xlsx(destination.replace('.xls', '.xlsx'))
class Cleaner:
    """Row extractors for the unpacked price workbooks.

    Every ``clean_*`` classmethod is a generator: it opens the workbook at
    ``table``, walks the active sheet and yields one dict of column values
    per usable row.  Year and month are taken from the workbook's file name,
    which must look like ``<category>-<city>-<year>-<month>.xlsx``.

    The methods read the workbook passed as ``table``; they do not depend on
    any module-level variables.
    """

    city = '福州市'

    @staticmethod
    def _split_year_month(table):
        """Return ``(year, month)`` strings parsed from the file name of *table*."""
        # Accept both Windows and POSIX separators regardless of the host OS.
        file_name = table.replace('\\', '/').rsplit('/', 1)[-1]
        _, _, year, month, *_ = file_name.split('.')[0].split('-')
        return year, month

    @staticmethod
    def _compact(text):
        """Return *text* with spaces removed, or ``''`` for an empty/None value."""
        return text.replace(' ', '') if text else ''

    @staticmethod
    def _is_skipped_header(row, markers):
        """Tell whether the first cell of *row* is text containing any marker."""
        first = row[0]
        return isinstance(first, str) and any(marker in first for marker in markers)

    @staticmethod
    def _is_blank(row):
        """Tell whether the first three cells of *row* are all empty."""
        return not row[0] and not row[1] and not row[2]

    @staticmethod
    def _rows(table):
        """Return an iterator over the value tuples of the active sheet of *table*."""
        return load_workbook(table).active.iter_rows(values_only=True)

    @classmethod
    def clean_gxj(cls, table=r'E:\Project\material_api\web\scripts\改性剂-福州-2022-12.xlsx'):
        """Yield asphalt-modifier prices dated inside the workbook's month.

        The material name is the cell to the right of the first '日期' cell;
        each datetime cell that falls in the month named by the file yields
        a record whose price is the cell to its right.
        """
        year, month = cls._split_year_month(table)
        start = datetime.datetime(int(year), int(month), 1)
        end = start + relativedelta(months=1)
        name = None
        for row in cls._rows(table):
            for k, cell in enumerate(row):
                if not name and cell == '日期':
                    name = row[k + 1]
                if isinstance(cell, datetime.datetime) and start <= cell < end:
                    print(year, month, name, cell, row[k + 1])
                    yield {
                        'name': name,
                        'spec': '',
                        'price': row[k + 1],
                        'date': cell,
                    }

    @classmethod
    def clean_ss(cls, table=r'E:\Project\material_api\web\scripts\三明钢铁-福州-2022-12.xlsx'):
        """Yield Sanming Steel rows that carry a price.

        A row with an empty name inherits the name of the previous data row
        (when there is one); rows without a price are skipped.
        """
        year, month = cls._split_year_month(table)
        date = datetime.datetime(int(year), int(month), 1)
        previous_name = None
        for row in cls._rows(table):
            if cls._is_skipped_header(row, ('材料编号', '三明钢铁')):
                continue
            material, name, spec, _, _, price, *_ = row
            if not name:
                name = previous_name
            previous_name = name
            item = (year, month, material, name, spec, price)
            if not price:
                continue
            print(item)
            yield {
                'name': cls._compact(name),
                'spec': cls._compact(spec),
                'material': material,
                'price': price,
                'fluctuating': None,
                'date': date,
            }

    @classmethod
    def clean_jtj(cls, table=r'E:\Project\material_api\web\scripts\交通局-福州-2022-12.xlsx'):
        """Yield transportation-bureau rows that carry a price other than '-'."""
        year, month = cls._split_year_month(table)
        date = datetime.datetime(int(year), int(month), 1)
        # NOTE(review): the '' marker matches every string, so any row whose
        # first cell is text is skipped - it looks like a character was lost
        # here; confirm the intended marker.
        markers = ('福建', '', '材料', '电话')
        for row in cls._rows(table):
            if cls._is_skipped_header(row, markers) or cls._is_blank(row):
                continue
            material, name, spec, unit, brand, price, *_ = row
            item = (year, month, material, name, spec, unit, brand, price)
            if not price or price == '-':
                continue
            print(item)
            yield {
                'name': cls._compact(name),
                'spec': cls._compact(spec),
                'price': price,
                'date': date,
                'material_id': material,
                'unit': unit,
                'brand': brand,
                # NOTE(review): replace('', '') leaves the value unchanged;
                # presumably a suffix was meant to be stripped - confirm.
                'region': cls.city.replace('', ''),
            }

    @classmethod
    def clean_glj(cls, table=r'E:\Project\material_api\web\scripts\公路局-福州-2022-12.xlsx'):
        """Yield highway-bureau rows that carry a price other than '-'.

        A row with an empty name inherits the name of the previous data row
        (when there is one).
        """
        year, month = cls._split_year_month(table)
        date = datetime.datetime(int(year), int(month), 1)
        # NOTE(review): the '' marker matches every string - confirm intent.
        markers = ('福建', '', '材料', '电话', '附件', '填报')
        previous_name = None
        for row in cls._rows(table):
            if cls._is_skipped_header(row, markers) or cls._is_blank(row):
                continue
            material, name, spec, unit, brand, price, _source, _remark, *_ = row
            if not name:
                name = previous_name
            previous_name = name
            item = (year, month, material, name, spec, unit, brand, price)
            if not price or price == '-':
                continue
            print(item)
            yield {
                'name': cls._compact(name),
                'spec': cls._compact(spec),
                'price': price,
                'date': date,
                'material_id': material,
                'unit': unit,
                'brand': brand,
                # NOTE(review): replace('', '') is a no-op - confirm intent.
                'region': cls.city.replace('', ''),
            }

    @classmethod
    def clean_dcb(cls, table=r'E:\Project\material_api\web\scripts\调查表-福州-2022-12.xlsx'):
        """Yield survey-form rows that carry a price other than '-'.

        Every raw row is printed before filtering.
        """
        year, month = cls._split_year_month(table)
        date = datetime.datetime(int(year), int(month), 1)
        # NOTE(review): the '' marker matches every string - confirm intent.
        markers = ('福建', '', '材料', '电话', '附件', '填报')
        for row in cls._rows(table):
            print(row)
            if cls._is_skipped_header(row, markers) or cls._is_blank(row):
                continue
            (material, name, spec, unit, brand, price,
             _price_without_tax, tax, _source, _remark, *_) = row
            item = (year, month, material, name, spec, unit, brand, price)
            if not price or price == '-':
                continue
            print(item)
            yield {
                'name': cls._compact(name),
                'spec': cls._compact(spec),
                'price': price,
                'date': date,
                'material_id': material,
                'unit': unit,
                'brand': brand,
                'tax': tax,
                # NOTE(review): replace('', '') is a no-op - confirm intent.
                'region': cls.city.replace('', ''),
            }

    @classmethod
    def clean_network(cls, table=r'E:\Project\material_api\web\scripts\网络价格-福州-2022-12.xlsx'):
        """Yield network-price rows that carry a price other than '-'."""
        year, month = cls._split_year_month(table)
        date = datetime.datetime(int(year), int(month), 1)
        # NOTE(review): the '' marker matches every string - confirm intent.
        markers = ('网络', '', '材料', '电话', '附件', '填报')
        for row in cls._rows(table):
            if cls._is_skipped_header(row, markers) or cls._is_blank(row):
                continue
            material, name, spec, unit, brand, price, _, remark, source, *_ = row
            item = (year, month, material, name, spec, unit, brand, price, source, remark)
            if not price or price == '-':
                continue
            print(item)
            yield {
                'material_id': material,
                'spec': cls._compact(spec),
                'unit': unit,
                'brand': brand,
                'name': cls._compact(name),
                'price': price,
                'source': source,
                'remark': remark,
                'date': date,
                # NOTE(review): replace('', '') is a no-op - confirm intent.
                'region': cls.city.replace('', ''),
            }

    @classmethod
    def clean_local(cls, table=r'E:\Project\material_api\web\scripts\地材-福州-2022-12.xlsx'):
        """Yield local-material rows that carry a usable price.

        Empty county and name cells inherit the value of the previous data
        row (when there is one).  Rows whose price is empty, '-' or '/' are
        skipped; a missing price-without-tax is stored as 0.
        """
        year, month = cls._split_year_month(table)
        date = datetime.datetime(int(year), int(month), 1)
        # NOTE(review): the '' marker matches every string - confirm intent.
        markers = ('汇总表', '', '材料', '电话', '附件', '填报', '单位', '序号')
        placeholders = ('-', '', '/')
        previous_county = previous_name = None
        for row in cls._rows(table):
            if cls._is_skipped_header(row, markers) or cls._is_blank(row):
                continue
            _, county, name, spec, unit, price, price_without_tax, position, remark, *_ = row
            if not county:
                county = previous_county
            if not name:
                name = previous_name
            previous_county, previous_name = county, name
            # Built before the tax-free price is normalised, as the log line
            # shows the raw cell value.
            item = (year, month, county, name, spec, unit, price, price_without_tax, position, remark)
            if not price or price in placeholders:
                continue
            if not price_without_tax or price_without_tax in placeholders:
                price_without_tax = 0
            print(item)
            yield {
                'name': cls._compact(name),
                'city': cls.city,
                'county': county,
                'material_id': '',
                'spec': cls._compact(spec),
                'unit': unit,
                'price': price,
                'price_without_tax': price_without_tax,
                'date': date,
                'position': position,
                'remark': remark,
            }
def get_month(filename): def get_month(filename):
@@ -50,4 +329,29 @@ def get_month(filename):
if __name__ == '__main__':
    from core.factory import ClientApp

    with ClientApp().app_context():
        # (file-name keyword, row extractor, model) - every entry is checked
        # independently and in this order for each workbook.
        pipelines = (
            ('改性剂', Cleaner.clean_gxj, AsphaltModifier),
            ('三明钢铁', Cleaner.clean_ss, SanmingSteel),
            ('交通局', Cleaner.clean_jtj, FuzhouTransportationBureau),
            ('公路局', Cleaner.clean_glj, FuzhouHighwayBureau),
            ('调查表', Cleaner.clean_dcb, FujianSurvey),
            ('网络价格', Cleaner.clean_network, DataNetwork),
            ('地材', Cleaner.clean_local, LocalMaterial),
        )
        # `file` and `file_path` keep their names: at this level they are
        # module globals, visible to other code in this file.
        for root, dirs, files in os.walk(r'E:\Project\material_api\web\scripts'):
            for file in files:
                file_path = os.path.join(root, file)
                if '.xlsx' not in file:
                    continue
                for keyword, extract, model in pipelines:
                    if keyword not in file:
                        continue
                    for item in extract(file_path):
                        model(**item).upsert()