3 mēneši atpakaļ · e0dc1fe6c4
--- a/app.py
+++ b/app.py
@@ -9,7 +9,7 @@ from flask import Flask, request, jsonify, send_file
 
				 from add_signature import find_signature_positions, add_signature_to_pdf
			
 
				 from add_watermark import add_watermark_to_pdf
			
 
				 from extract_table import extract_temp_time, extract_pdf_table_to_excel, extract_temp_by_datetime_pattern, allowed_file, \
			
 
				-    safe_filename, extract_temperature_data_from_pdf
			
 
				+    safe_filename, extract_temperature_data_from_pdf, extract_data_from_pdf_5
			
 
				 from lib import Qiniu
			
 
				 from werkzeug.utils import secure_filename
			
 
				 from flask_cors import CORS
			
@@ -132,6 +132,9 @@ def extract_table():
 
				                     df = extract_temp_by_datetime_pattern(filepath)
			
 
				                 if "详细数据" in text:
			
 
				                     df = extract_temperature_data_from_pdf(filepath)
			
 
				+                else:
			
 
				+                    df = extract_data_from_pdf_5(filepath)
			
 
				+
			
 
				 
			
 
				         if df is None:
			
 
				             os.remove(filepath)
			
--- a/extract_table.py
+++ b/extract_table.py
@@ -132,6 +132,7 @@ def extract_temp_by_datetime_pattern(pdf_path):
 
				     df = df.sort_values('时间').reset_index(drop=True)
			
 
				     return df
			
 
				 
			
 
				+
			
 
				 def extract_temperature_data_from_pdf(pdf_path):
			
 
				     """
			
 
				     从PDF文件中提取时间和温度数据
			
@@ -157,4 +158,80 @@ def extract_temperature_data_from_pdf(pdf_path):
 
				     df = pd.DataFrame(all_data, columns=['时间', '温度'])
			
 
				     # 按时间排序
			
 
				     df = df.sort_values('时间').reset_index(drop=True)
			
 
				-    return df
			
 
				+    return df
			
 
				+
			
 
				+
			
 
				+# 第五种处理方法
			
 
				+def extract_data_from_pdf_5(pdf_path):
			
 
				+    """
			
 
				+    从PDF文件中提取时间和温度数据
			
 
				+    """
			
 
				+    cleaned_data = []
			
 
				+    valid_table_lines = []  # 存储目标文件中有效表格行
			
 
				+    with pdfplumber.open(pdf_path) as pdf:
			
 
				+        for page in pdf.pages:
			
 
				+            page_content = page.extract_text()
			
 
				+            if not page_content:
			
 
				+                continue  # 跳过空页
			
 
				+            # 按换行符分割为单行，去除首尾空格，排除页码行（如“第1页/共3页”）
			
 
				+            page_lines = [
			
 
				+                line.strip() for line in page_content.split("\n")
			
 
				+                if "第" not in line or "页" not in line
			
 
				+            ]
			
 
				+            for line in page_lines:
			
 
				+                if '时间' in line:
			
 
				+                    continue
			
 
				+                # 筛选条件：含时间（任意年月日时分）+ 温度（数字℃）+ 湿度（数字%）特征
			
 
				+                has_time = re.search(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}", line)
			
 
				+                has_temp = re.search(r"\d+\.?\d*℃", line)
			
 
				+                has_humi = re.search(r"\d+\.?\d*%", line)
			
 
				+                if has_time and has_temp:
			
 
				+                    valid_table_lines.append(line)
			
 
				+
			
 
				+
			
 
				+    for line_idx, line_content in enumerate(valid_table_lines, 1):
			
 
				+        parts = line_content.split(" ")
			
 
				+        if len(parts) == 4:
			
 
				+            data = {
			
 
				+                "时间": parts[0] + " " + parts[1],
			
 
				+                "温度": parts[2].replace("℃", ""),
			
 
				+            }
			
 
				+            cleaned_data.append(data)
			
 
				+        if len(parts) == 8:
			
 
				+            data1 = {
			
 
				+                "时间": parts[0] + " " + parts[1],
			
 
				+                "温度": parts[2].replace("℃", ""),
			
 
				+            }
			
 
				+            data2 = {
			
 
				+                "时间": parts[4] + " " + parts[5],
			
 
				+                "温度": parts[6].replace("℃", ""),
			
 
				+            }
			
 
				+            cleaned_data.append(data1)
			
 
				+            cleaned_data.append(data2)
			
 
				+        if len(parts) == 6:
			
 
				+            data = {
			
 
				+                "时间": parts[0] + " " + parts[1],
			
 
				+                "温度": parts[2].replace("℃", ""),
			
 
				+                "湿度": parts[4].replace("%", ""),
			
 
				+            }
			
 
				+            cleaned_data.append(data)
			
 
				+        if len(parts) == 12:
			
 
				+            data1 = {
			
 
				+                "时间": parts[0] + " " + parts[1],
			
 
				+                "温度": parts[2].replace("℃", ""),
			
 
				+                "湿度": parts[4].replace("%", ""),
			
 
				+            }
			
 
				+            data2 = {
			
 
				+                "时间": parts[6] + " " + parts[7],
			
 
				+                "温度": parts[8].replace("℃", ""),
			
 
				+                "湿度": parts[10].replace("%", ""),
			
 
				+            }
			
 
				+            cleaned_data.append(data1)
			
 
				+            cleaned_data.append(data2)
			
 
				+
			
 
				+
			
 
				+    # 转换为DataFrame
			
 
				+    df = pd.DataFrame(cleaned_data, columns=['时间', '温度', '湿度'])
			
 
				+    # 按时间排序
			
 
				+    df = df.sort_values('时间').reset_index(drop=True)
			
 
				+    return df