|
@@ -0,0 +1,133 @@
|
|
|
|
+import pandas as pd
|
|
|
|
+import pdfplumber
|
|
|
|
+import re
|
|
|
|
+from datetime import datetime
|
|
|
|
+
|
|
|
|
+import tabula
|
|
|
|
+
|
|
|
|
# File extensions accepted by allowed_file(); entries are lower case because
# the candidate extension is lower-cased before the membership test.
ALLOWED_EXTENSIONS = {'pdf'}


def allowed_file(filename):
    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

    The extension is the text after the last ``.`` and is compared
    case-insensitively. A name that contains no ``.`` at all is rejected.
    """
    _stem, dot, extension = filename.rpartition('.')
    # ``dot`` is empty exactly when the name contains no '.' separator.
    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def safe_filename(filename):
    """Build a sanitised file name that still keeps Chinese characters.

    Every character is dropped unless it is alphanumeric according to
    ``str.isalnum`` (which covers CJK characters as well as ASCII letters and
    digits) or is one of space, ``.``, ``_`` or ``-``. Trailing whitespace is
    stripped from the result; leading spaces are kept.

    The result may be empty, and names made only of dots (such as ``..``)
    pass through unchanged.
    """
    extra_allowed = frozenset(' ._-')
    kept = [ch for ch in filename if ch in extra_allowed or ch.isalnum()]
    return "".join(kept).rstrip()
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def get_pdf_page_count(pdf_path):
    """Return the number of pages in the PDF opened from *pdf_path*."""
    with pdfplumber.open(pdf_path) as document:
        return len(document.pages)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def extract_temp_time(pdf_path):
    """Method 1: extract records by splitting each text line on whitespace.

    The text of every page is read with ``page.extract_text()`` and processed
    line by line. Lines containing "历史数据表" or "时间" are skipped
    (presumably the title and column-header lines). Each remaining line must
    split into at least six whitespace-separated fields, read in order as
    date, time, name, id, temperature and humidity; extra fields are ignored.

    Args:
        pdf_path: Value passed straight to ``pdfplumber.open``.

    Returns:
        pandas.DataFrame with columns 时间, 名称, 编号, 温度, 湿度. All values
        are kept as strings, 时间 is "<date> <time>", and a humidity of "--"
        is stored as an empty string. Rows are sorted by the 时间 string and
        re-indexed from 0.
    """
    # Six fields are unpacked below, so anything shorter must be rejected.
    # (The previous guard of 5 let five-field lines through and then raised
    # IndexError when reading the humidity field.)
    required_fields = 6
    rows = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                continue
            for line in text.split("\n"):
                if "历史数据表" in line or "时间" in line:
                    continue
                # str.split() with no argument splits on whitespace runs and
                # never yields empty strings.
                fields = line.split()
                if len(fields) < required_fields:
                    # Kept from the original: show the line that was skipped.
                    print(fields)
                    continue
                date, clock, name, ids, temp, humi = fields[:required_fields]
                if humi == "--":
                    # "--" marks a missing humidity reading.
                    humi = ""
                rows.append([date + " " + clock, name, ids, temp, humi])

    df = pd.DataFrame(
        rows,
        columns=['时间', '名称', '编号', '温度', '湿度'],
    )
    return df.sort_values('时间').reset_index(drop=True)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def extract_pdf_table_to_excel(pdf_path):
    """Method 2: extract records from the table detected on each page.

    Pages from index 2 onward (the third page and later) are scanned with
    ``page.extract_table()``, which yields a single table as a list of rows,
    each row being a list of cell values. The first row of every table is
    skipped (presumably the header row). Each remaining cell is split on
    newlines, and every resulting line that consists of exactly four
    single-space-separated fields is read as date, time, temperature and
    humidity.

    Despite its name, this function writes no Excel file; it only returns the
    extracted data.

    Args:
        pdf_path: Value passed straight to ``pdfplumber.open``.

    Returns:
        pandas.DataFrame with columns 时间, 温度, 湿度. 时间 is
        "<date> <time>" with any "/" in the date replaced by "-"; all values
        are strings. Rows are sorted by the 时间 string and re-indexed from 0.
    """
    records = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages[2:]:
            table_rows = page.extract_table()
            if not table_rows:
                # No table was found on this page.
                continue
            for row in table_rows[1:]:
                for cell in row:
                    if not cell:
                        # Empty cells can come back as None; calling .split
                        # on them used to raise AttributeError.
                        continue
                    for entry in cell.split('\n'):
                        fields = entry.strip().split(" ")
                        if len(fields) != 4:
                            continue
                        date, clock, temp, humi = fields
                        timestamp = date.replace("/", "-") + " " + clock
                        records.append([timestamp, temp, humi])

    result_df = pd.DataFrame(
        records,
        columns=['时间', '温度', '湿度'],
    )
    return result_df.sort_values('时间').reset_index(drop=True)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def extract_temp_by_datetime_pattern(pdf_path):
    """Method 3: extract records from lines with a timestamp and a ℃ sign.

    Pages from index 1 onward (the second page and later) are read with
    ``page.extract_text()``. A line is considered only if it contains a
    ``YYYY-MM-DD HH:MM`` timestamp somewhere and the character "℃". The line
    is split on whitespace and read as up to two side-by-side records of
    three tokens each (date, time, temperature), at token offsets 0 and 3.

    If a record's timestamp does not parse with ``%Y-%m-%d %H:%M``, the rest
    of that line is skipped; records already taken from it are kept.

    Args:
        pdf_path: Value passed straight to ``pdfplumber.open``.

    Returns:
        pandas.DataFrame with columns 时间 (``datetime`` objects) and 温度
        (the temperature token with "℃" removed, still a string), sorted by
        时间 and re-indexed from 0.
    """
    datetime_pattern = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}')
    # Token index at which each "<date> <time> <temp>" record starts.
    record_offsets = (0, 3)
    tokens_per_record = 3
    records = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages[1:]:
            text = page.extract_text()
            if not text:
                continue

            for line in text.split('\n'):
                if not datetime_pattern.search(line) or '℃' not in line:
                    continue

                parts = line.split()
                for start in record_offsets:
                    if len(parts) < start + tokens_per_record:
                        # Not enough tokens left for another record.
                        break
                    time_str = ' '.join(parts[start:start + 2])
                    try:
                        timestamp = datetime.strptime(time_str, '%Y-%m-%d %H:%M')
                    except ValueError:
                        # Same as the original: an unparsable timestamp
                        # abandons the remainder of the line.
                        break
                    temp_str = parts[start + 2].replace('℃', '')
                    records.append({'时间': timestamp, '温度': temp_str})

    df = pd.DataFrame(records, columns=['时间', '温度'])
    return df.sort_values('时间').reset_index(drop=True)
|