
Image compression

huangyan · 10 months ago
commit 814224bad8

BIN
default_image.png


BIN
nmc/__pycache__/settings.cpython-311.pyc


+ 40 - 2
nmc/settings.py

@@ -20,16 +20,54 @@ LOG_LEVEL = 'ERROR'
 ROBOTSTXT_OBEY = False
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 CONCURRENT_REQUESTS = 32
-
+RETRY_TIMES = 3  # number of times to retry a failed request
+DOWNLOAD_TIMEOUT = 15  # default download timeout, in seconds
 # Configure a delay for requests for the same website (default: 0)
 # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 1
 RANDOMIZE_DOWNLOAD_DELAY = True
 # The download delay setting will honor only one of:
 # CONCURRENT_REQUESTS_PER_DOMAIN = 16
 # CONCURRENT_REQUESTS_PER_IP = 16
+# Configure the log level: DEBUG, INFO, WARNING, ERROR, etc.
+
+# Configure the log format
+LOG_FORMAT = '%(asctime)s [%(name)s] %(levelname)s: %(message)s'
+
+# Custom file handler that writes log output to a file
+LOG_FILE = 'logs/scrapy.log'  # log file path, relative or absolute
+
+# Log handlers: only a single file handler is configured here
+LOG_HANDLERS = {
+    'file': {
+        'class': 'logging.FileHandler',
+        'formatter': 'default',
+        'filename': LOG_FILE,
+    },
+}
+
+# Formatters used by the log handlers
+LOG_FORMATTERS = {
+    'default': {
+        'format': LOG_FORMAT,
+        'datefmt': '%Y-%m-%d %H:%M:%S',
+    },
+}
 
+# Maps loggers to handlers: defines which loggers use which handlers
+LOGGER_DELEGATION = {
+    'scrapy': {
+        'handlers': ['file'],
+        'level': LOG_LEVEL,
+        'propagate': False,  # do not propagate to the parent logger
+    },
+    'twisted.core.engine': {  # other core components can be configured the same way
+        'handlers': ['file'],
+        'level': LOG_LEVEL,
+        'propagate': False,
+    },
+}
 # Disable cookies (enabled by default)
 COOKIES_ENABLED = False
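
Note on the logging settings above: Scrapy's built-in logging is configured through LOG_LEVEL, LOG_FILE, LOG_FORMAT and LOG_DATEFORMAT (among a few related settings); the LOG_HANDLERS, LOG_FORMATTERS and LOGGER_DELEGATION dictionaries are custom keys that Scrapy does not read unless they are applied by hand. A minimal sketch of one way to apply them (the apply_custom_logging helper is hypothetical, not part of this commit), assuming the dictionaries keep the shapes shown above:

    import logging.config

    def apply_custom_logging(settings):
        # Feed the custom dicts from settings.py into the stdlib logging config
        logging.config.dictConfig({
            'version': 1,
            'disable_existing_loggers': False,
            'formatters': settings.get('LOG_FORMATTERS'),
            'handlers': settings.get('LOG_HANDLERS'),
            'loggers': settings.get('LOGGER_DELEGATION'),
        })

This could be called, for example, from a spider's from_crawler using crawler.settings.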
 

BIN
nmc/spiders/__pycache__/nmcspider.cpython-311.pyc


+ 74 - 64
nmc/spiders/nmcspider.py

@@ -2,11 +2,13 @@ import io
 import os.path
 import time
 from datetime import datetime
+from logging import log
 from urllib.parse import urlparse
 
 import scrapy
 from PIL import Image
 from selenium import webdriver
+from selenium.common import TimeoutException
 from selenium.webdriver import ActionChains
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.common.by import By
@@ -31,7 +33,9 @@ class NmcspiderSpider(scrapy.Spider):
     def __init__(self):
         chrome_path = r'E:\chromedriver_win32\chromedriver.exe'
         chrome_options = Options()
-        # Add the chromedriver path to chrome_options
+        # Add the chromedriver path to chrome_options
+        chrome_options = webdriver.ChromeOptions()
+        chrome_options.add_experimental_option("detach", True)
         chrome_options.add_argument('--webdriver-executable-path=' + chrome_path)
        # Additional options can be added here if needed, e.g. to hide the browser window:
         chrome_options.add_argument('--headless')
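
For reference, '--webdriver-executable-path' is not a recognized Chrome switch, so the driver path added this way is unlikely to take effect. A minimal sketch (an assumption, not part of this commit) of the usual Selenium 4 setup, where the chromedriver path is passed through a Service object instead:

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service

    chrome_path = r'E:\chromedriver_win32\chromedriver.exe'  # same path as in the spider
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Chrome(service=Service(executable_path=chrome_path), options=options)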
@@ -54,7 +58,6 @@ class NmcspiderSpider(scrapy.Spider):
 
     def parse_province_details(self, response):
         provinces_data = response.json()
-
         for provinces in provinces_data:
             province_url = provinces.get('url')
             province = provinces.get('province')
@@ -72,59 +75,61 @@ class NmcspiderSpider(scrapy.Spider):
         self.driver.set_window_size(1920, 1080)
        # Create the output directory
         today_str = datetime.now().strftime("%Y-%m-%d")
-        path_to_save = os.path.join(province, city, today_str)
+        path_to_save = os.path.join("resource", province, city, today_str)
         full_path = os.path.join(".", path_to_save)
         if not os.path.exists(full_path):
             os.makedirs(full_path)
+        # First image: the weather forecast chart
+        element = WebDriverWait(self.driver, 20).until(
+            EC.presence_of_element_located((By.XPATH, '/html/body/div[2]/div')))
+        self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
+        location = element.location
+        size = element.size
+        # Capture the forecast chart
+        # Process the screenshot with PIL
+        screenshot = self.driver.get_screenshot_as_png()
+        image = Image.open(io.BytesIO(screenshot))
+        crop_area = (location['x'], location['y'], location['x'] + size['width'], location['y'] + size['height'])
+        cropped_image = image.crop(crop_area)
+        # Convert the image to RGB mode, which the JPEG format requires
+        cropped_image = cropped_image.convert("RGB")
+        # Save the cropped image
+        url = urlparse(response.url)
+        parts = url.path.split('/')
+        target_part = '_'.join([parts[3], parts[4].split('.')[0]])
+        name = target_part + "_1.png"
+        jpeg_name = name.rsplit('.', 1)[0] + ".jpg"
+        jpeg_path = os.path.join(full_path, jpeg_name)
+        quality = 50
+        # image_path = os.path.join(full_path, name)
+        cropped_image.save(jpeg_path, "JPEG", quality=quality)
+        # Second image
+        highcharts = WebDriverWait(self.driver, 20).until(
+            EC.presence_of_element_located((By.ID, 'realChart')))
+        self.driver.execute_script("arguments[0].scrollIntoView({block: 'center', behavior: 'instant'});",
+                                   highcharts)
+        # time.sleep(2)
+        # Take a screenshot
+        screenshot = self.driver.get_screenshot_as_png()
+        script = """
+                var element = document.getElementById('realChart');
+                var rect = element.getBoundingClientRect();
+                return {left: rect.left, top: rect.top, width: rect.width, height: rect.height};
+                """
+        position_info = self.driver.execute_script(script)
+        # Capture the chart
+        # Process the screenshot with PIL
+        image = Image.open(io.BytesIO(screenshot))
+        crop_area = (
+            position_info['left'], position_info['top'],
+            position_info['left'] + position_info['width'],
+            position_info['top'] + position_info['height'])
+        cropped_image = image.crop(crop_area)
+        # Save the cropped image
+        name = target_part + "_2.png"
+        image_path = os.path.join(full_path, name)
+        cropped_image.save(image_path)
         try:
-            # First image: the weather forecast chart
-            element = WebDriverWait(self.driver, 20).until(
-                EC.presence_of_element_located((By.XPATH, '/html/body/div[2]/div')))
-            self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
-            location = element.location
-            size = element.size
-            # Capture the forecast chart
-            # Process the screenshot with PIL
-            screenshot = self.driver.get_screenshot_as_png()
-            image = Image.open(io.BytesIO(screenshot))
-            crop_area = (location['x'], location['y'], location['x'] + size['width'], location['y'] + size['height'])
-            cropped_image = image.crop(crop_area)
-            # Save the cropped image
-            url = urlparse(response.url)
-            parts = url.path.split('/')
-            target_part = '_'.join([parts[3], parts[4].split('.')[0]])
-            name = target_part + "_1.png"
-            image_path = os.path.join(full_path, name)
-            cropped_image.save(image_path)
-            # Second image
-            highcharts = WebDriverWait(self.driver, 20).until(
-                EC.presence_of_element_located((By.ID, 'realChart')))
-            self.driver.execute_script("arguments[0].scrollIntoView({block: 'center', behavior: 'instant'});",
-                                       highcharts)
-            time.sleep(2)
-            # Wait for all JS to finish loading
-            js_is_complete(self.driver)
-            WebDriverWait(self.driver, 20).until(js_is_complete)
-            # Take a screenshot
-            screenshot = self.driver.get_screenshot_as_png()
-            script = """
-            var element = document.getElementById('realChart');
-            var rect = element.getBoundingClientRect();
-            return {left: rect.left, top: rect.top, width: rect.width, height: rect.height};
-            """
-            position_info = self.driver.execute_script(script)
-            # Capture the chart
-            # Process the screenshot with PIL
-            image = Image.open(io.BytesIO(screenshot))
-            crop_area = (
-                position_info['left'], position_info['top'],
-                position_info['left'] + position_info['width'],
-                position_info['top'] + position_info['height'])
-            cropped_image = image.crop(crop_area)
-            # Save the cropped image
-            name = target_part + "_2.png"
-            image_path = os.path.join(full_path, name)
-            cropped_image.save(image_path)
            # Third image
            # Wait for all JS to finish loading
             js_is_complete(self.driver)
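
The compression this commit's message refers to is the JPEG save introduced above: the first chart is now written as a JPEG with quality=50 instead of a PNG. A minimal sketch of that step in isolation (save_compressed is a hypothetical helper, not in the commit):

    from PIL import Image

    def save_compressed(img: Image.Image, path: str, quality: int = 50) -> None:
        # JPEG has no alpha channel, so convert to RGB before saving;
        # lowering quality trades detail for a smaller file
        img.convert("RGB").save(path, "JPEG", quality=quality)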
@@ -134,24 +139,23 @@ class NmcspiderSpider(scrapy.Spider):
             self.driver.execute_script("arguments[0].scrollIntoView({block: 'center', behavior: 'instant'});",
                                        highcharts)
            # Locate the legend item to click
+            WebDriverWait(self.driver, 20).until(
+                EC.presence_of_element_located((By.CSS_SELECTOR,
+                                                '#container> .highcharts-container >svg > g > g > g >g.highcharts-legend-item.highcharts-column-series.highcharts-color-undefined.highcharts-series-0')))
             element = self.driver.find_element(By.CSS_SELECTOR,
                                                '#container> .highcharts-container >svg > g > g > g >g.highcharts-legend-item.highcharts-column-series.highcharts-color-undefined.highcharts-series-0')
             self.driver.execute_script("arguments[0].dispatchEvent(new Event('click'));",
                                        element)
-            # target_element = WebDriverWait(self.driver, 20).until(
-            #     EC.visibility_of_element_located((By.CSS_SELECTOR,
-            #                                       "#highcharts-v9zgpzc-36 > svg > g.highcharts-legend > g > g > g.highcharts-legend-item.highcharts-column-series.highcharts-color-undefined.highcharts-series-0 > text")))
-            # target_element.click()
            # Wait for the page to finish loading after the click
             WebDriverWait(self.driver, 20).until(EC.visibility_of_element_located((By.ID, 'container')))
-            time.sleep(2)
+            # time.sleep(2)
            # Take a screenshot
             screenshot = self.driver.get_screenshot_as_png()
             scripts = """
-                var element = document.getElementById('container');
-                var rect = element.getBoundingClientRect();
-                return {left: rect.left, top: rect.top, width: rect.width, height: rect.height};
-                """
+                        var element = document.getElementById('container');
+                        var rect = element.getBoundingClientRect();
+                        return {left: rect.left, top: rect.top, width: rect.width, height: rect.height};
+                        """
             position_info = self.driver.execute_script(scripts)
            # Capture the chart
            # Process the screenshot with PIL
@@ -161,10 +165,16 @@ class NmcspiderSpider(scrapy.Spider):
                 position_info['left'] + position_info['width'],
                 position_info['top'] + position_info['height'])
             cropped_image = image.crop(crop_area)
-            # Save the cropped image
+            # Save the cropped image
             name = target_part + "_3.png"
             image_path = os.path.join(full_path, name)
             cropped_image.save(image_path)
-
-        finally:
-            print("关闭浏览器")
+        except TimeoutException:
+            default_image_path = "default_image.png"  # path to the default fallback image
+            backup_image = Image.open(default_image_path)
+            name = target_part + "_3.png"
+            image_path = os.path.join(full_path, name)
+            backup_image.save(image_path)
+            current_url = self.driver.current_url  # 获取当前页面的URL
+            self.logger.error(
+                f"在页面 {current_url} 上,在指定时间内未能找到指定元素使用默认图片")