|
|
@@ -132,6 +132,7 @@ def extract_temp_by_datetime_pattern(pdf_path):
|
|
|
df = df.sort_values('时间').reset_index(drop=True)
|
|
|
return df
|
|
|
|
|
|
+
|
|
|
def extract_temperature_data_from_pdf(pdf_path):
|
|
|
"""
|
|
|
从PDF文件中提取时间和温度数据
|
|
|
@@ -157,4 +158,80 @@ def extract_temperature_data_from_pdf(pdf_path):
|
|
|
df = pd.DataFrame(all_data, columns=['时间', '温度'])
|
|
|
# 按时间排序
|
|
|
df = df.sort_values('时间').reset_index(drop=True)
|
|
|
- return df
|
|
|
+ return df
|
|
|
+
|
|
|
+
|
|
|
+# 第五种处理方法
|
|
|
def extract_data_from_pdf_5(pdf_path):
    """Extract time, temperature and (when present) humidity rows from a PDF.

    The text of every page is scanned line by line. A line is kept when it
    contains a ``YYYY-MM-DD HH:MM`` timestamp and a ``<number>℃`` reading;
    lines containing both "第" and "页" (page-number footers such as
    "第1页/共3页") and lines containing "时间" (table headers) are ignored.

    Each kept line is split on single spaces and interpreted by its field
    count:

    * 4 fields  -> one record: date, time, temperature, (ignored field)
    * 8 fields  -> two such records side by side
    * 6 fields  -> one record: date, time, temperature, (ignored),
      humidity, (ignored)
    * 12 fields -> two such records side by side

    Lines with any other field count are silently dropped.

    Args:
        pdf_path: Location of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        pandas.DataFrame with columns '时间', '温度', '湿度', sorted by the
        '时间' column and re-indexed from 0. Values are kept as strings with
        the "℃" / "%" unit removed; '湿度' is missing (NaN) for records that
        come from the layouts without a humidity field.
    """
    # Compiled once instead of on every line of every page.
    time_pattern = re.compile(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}")
    temp_pattern = re.compile(r"\d+\.?\d*℃")

    # Field count of a line -> (fields per record, record carries humidity).
    layouts = {
        4: (4, False),
        8: (4, False),
        6: (6, True),
        12: (6, True),
    }

    valid_table_lines = []  # lines that look like data rows of the table
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_content = page.extract_text()
            if not page_content:
                continue  # page without extractable text
            for raw_line in page_content.split("\n"):
                # Page-number footer, e.g. "第1页/共3页".
                if "第" in raw_line and "页" in raw_line:
                    continue
                line = raw_line.strip()
                # Table header row.
                if "时间" in line:
                    continue
                if time_pattern.search(line) and temp_pattern.search(line):
                    valid_table_lines.append(line)

    cleaned_data = []
    for line in valid_table_lines:
        # Split on single spaces (not arbitrary whitespace) so that the
        # field positions below keep their original meaning.
        parts = line.split(" ")
        layout = layouts.get(len(parts))
        if layout is None:
            continue  # unknown row shape
        width, has_humidity = layout
        # A line holds one record, or two records laid out side by side.
        for start in range(0, len(parts), width):
            record = {
                "时间": parts[start] + " " + parts[start + 1],
                "温度": parts[start + 2].replace("℃", ""),
            }
            if has_humidity:
                record["湿度"] = parts[start + 4].replace("%", "")
            cleaned_data.append(record)

    # Build the DataFrame; records without a humidity key get NaN there.
    df = pd.DataFrame(cleaned_data, columns=['时间', '温度', '湿度'])
    # Sort by the timestamp column.
    df = df.sort_values('时间').reset_index(drop=True)
    return df
|