extract_table.py 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235
  1. import pandas as pd
  2. import pdfplumber
  3. import re
  4. from datetime import datetime
  5. ALLOWED_EXTENSIONS = {'pdf'}
  6. def allowed_file(filename):
  7. return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
  8. def safe_filename(filename):
  9. """生成安全的文件名,同时保留中文"""
  10. # 保留中文、字母、数字、下划线和点
  11. keep_chars = (' ', '.', '_', '-')
  12. filename = "".join(c for c in filename if c.isalnum() or c in keep_chars).rstrip()
  13. return filename
  14. def get_pdf_page_count(pdf_path):
  15. with pdfplumber.open(pdf_path) as pdf:
  16. page_count = len(pdf.pages)
  17. return page_count
  18. def extract_temp_time(pdf_path):
  19. """第一种处理方法:基于文本分割的提取"""
  20. cleaned_data = []
  21. with pdfplumber.open(pdf_path) as pdf:
  22. for page in pdf.pages:
  23. text = page.extract_text()
  24. if text:
  25. text_list = text.split("\n")
  26. for txt in text_list:
  27. if ("历史数据表" not in txt) and ("时间" not in txt):
  28. foo = [p for p in re.split(r'\s{1,}', txt.strip()) if p]
  29. if len(foo) < 5:
  30. print(foo)
  31. continue
  32. date_time, name, ids, temp, humi = foo[0] + " " + foo[1], foo[2], foo[3], foo[4], foo[5]
  33. if foo[5] == "--":
  34. humi = ""
  35. cleaned_data.append([date_time, name, ids, temp, humi])
  36. df = pd.DataFrame(
  37. cleaned_data,
  38. columns=['时间', '名称', '编号', '温度', '湿度']
  39. )
  40. df = df.sort_values('时间').reset_index(drop=True)
  41. return df
  42. def extract_pdf_table_to_excel(pdf_path):
  43. """第二种处理方法:基于表格提取"""
  44. cleaned_data = []
  45. with pdfplumber.open(pdf_path) as pdf:
  46. for page in pdf.pages[2:]:
  47. tables = page.extract_table()
  48. if tables:
  49. if len(tables) >= 2:
  50. for table in tables[1:]:
  51. for row in table:
  52. for cell in row.split('\n'):
  53. foo = str(cell).strip().split(" ")
  54. if len(foo) == 4:
  55. date_time, temp, humi = foo[0].replace("/", "-") + " " + foo[1], foo[2], foo[3]
  56. # 拆分日期和时间
  57. cleaned_data.append([date_time, temp, humi])
  58. result_df = pd.DataFrame(
  59. cleaned_data,
  60. columns=['时间', '温度', '湿度']
  61. )
  62. result_df = result_df.sort_values('时间').reset_index(drop=True)
  63. return result_df
  64. def extract_temp_by_datetime_pattern(pdf_path):
  65. """第三种处理方法:基于日期时间模式和温度符号的提取"""
  66. all_data = []
  67. datetime_pattern = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}')
  68. with pdfplumber.open(pdf_path) as pdf:
  69. # 从第二页开始处理(索引1)
  70. for page in pdf.pages[1:]:
  71. text = page.extract_text()
  72. if not text:
  73. continue
  74. lines = text.split('\n')
  75. for line in lines:
  76. # 检查行是否包含日期时间格式和温度符号
  77. if datetime_pattern.search(line) and '℃' in line:
  78. parts = line.split()
  79. if len(parts) >= 3:
  80. # 提取时间部分
  81. time_str = ' '.join(parts[:2])
  82. try:
  83. # 转换为datetime对象
  84. time = datetime.strptime(time_str, '%Y-%m-%d %H:%M')
  85. # 提取温度值(去掉℃符号)
  86. temp_str = parts[2].replace('℃', '')
  87. try:
  88. # 添加到数据列表
  89. all_data.append({'时间': time, '温度': temp_str})
  90. except ValueError:
  91. continue
  92. except ValueError:
  93. continue
  94. if len(parts) >= 6:
  95. # 提取时间部分
  96. time_str = ' '.join(parts[3:5])
  97. try:
  98. # 转换为datetime对象
  99. time = datetime.strptime(time_str, '%Y-%m-%d %H:%M')
  100. # 提取温度值(去掉℃符号)
  101. temp_str = parts[5].replace('℃', '')
  102. try:
  103. # 添加到数据列表
  104. all_data.append({'时间': time, '温度': temp_str})
  105. except ValueError:
  106. continue
  107. except ValueError:
  108. continue
  109. df = pd.DataFrame(all_data, columns=['时间', '温度'])
  110. df = df.sort_values('时间').reset_index(drop=True)
  111. return df
  112. def extract_temperature_data_from_pdf(pdf_path):
  113. """
  114. 从PDF文件中提取时间和温度数据
  115. """
  116. all_data = []
  117. with pdfplumber.open(pdf_path) as pdf:
  118. for page in pdf.pages:
  119. text = page.extract_text()
  120. # 使用正则表达式匹配数据行
  121. # 匹配模式: 序号 | 日期时间 | 温度 | 状态
  122. for value in text.split("\n"):
  123. pattern = r'(\d+)\s+(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\s+(-?\d+\.\d+|-?\d+)\s+([^\s]+)'
  124. matches = re.findall(pattern, value)
  125. for match in matches:
  126. index, datetime_str, temperature, status = match
  127. all_data.append({
  128. '时间': datetime_str,
  129. '温度': temperature,
  130. })
  131. # 转换为DataFrame
  132. df = pd.DataFrame(all_data, columns=['时间', '温度'])
  133. # 按时间排序
  134. df = df.sort_values('时间').reset_index(drop=True)
  135. return df
  136. # 第五种处理方法
  137. def extract_data_from_pdf_5(pdf_path):
  138. """
  139. 从PDF文件中提取时间和温度数据
  140. """
  141. cleaned_data = []
  142. valid_table_lines = [] # 存储目标文件中有效表格行
  143. with pdfplumber.open(pdf_path) as pdf:
  144. for page in pdf.pages:
  145. page_content = page.extract_text()
  146. if not page_content:
  147. continue # 跳过空页
  148. # 按换行符分割为单行,去除首尾空格,排除页码行(如“第1页/共3页”)
  149. page_lines = [
  150. line.strip() for line in page_content.split("\n")
  151. if "第" not in line or "页" not in line
  152. ]
  153. for line in page_lines:
  154. if '时间' in line:
  155. continue
  156. # 筛选条件:含时间(任意年月日时分)+ 温度(数字℃)+ 湿度(数字%)特征
  157. has_time = re.search(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}", line)
  158. has_temp = re.search(r"\d+\.?\d*℃", line)
  159. has_humi = re.search(r"\d+\.?\d*%", line)
  160. if has_time and has_temp:
  161. valid_table_lines.append(line)
  162. for line_idx, line_content in enumerate(valid_table_lines, 1):
  163. parts = line_content.split(" ")
  164. if len(parts) == 4:
  165. data = {
  166. "时间": parts[0] + " " + parts[1],
  167. "温度": parts[2].replace("℃", ""),
  168. }
  169. cleaned_data.append(data)
  170. if len(parts) == 8:
  171. data1 = {
  172. "时间": parts[0] + " " + parts[1],
  173. "温度": parts[2].replace("℃", ""),
  174. }
  175. data2 = {
  176. "时间": parts[4] + " " + parts[5],
  177. "温度": parts[6].replace("℃", ""),
  178. }
  179. cleaned_data.append(data1)
  180. cleaned_data.append(data2)
  181. if len(parts) == 6:
  182. data = {
  183. "时间": parts[0] + " " + parts[1],
  184. "温度": parts[2].replace("℃", ""),
  185. "湿度": parts[4].replace("%", ""),
  186. }
  187. cleaned_data.append(data)
  188. if len(parts) == 12:
  189. data1 = {
  190. "时间": parts[0] + " " + parts[1],
  191. "温度": parts[2].replace("℃", ""),
  192. "湿度": parts[4].replace("%", ""),
  193. }
  194. data2 = {
  195. "时间": parts[6] + " " + parts[7],
  196. "温度": parts[8].replace("℃", ""),
  197. "湿度": parts[10].replace("%", ""),
  198. }
  199. cleaned_data.append(data1)
  200. cleaned_data.append(data2)
  201. # 转换为DataFrame
  202. df = pd.DataFrame(cleaned_data, columns=['时间', '温度', '湿度'])
  203. # 按时间排序
  204. df = df.sort_values('时间').reset_index(drop=True)
  205. return df