Browse Source

图片压缩

huangyan 10 months ago
parent
commit
814224bad8

BIN
default_image.png


BIN
nmc/__pycache__/settings.cpython-311.pyc


+ 40 - 2
nmc/settings.py

@@ -20,16 +20,54 @@ LOG_LEVEL = 'ERROR'
 ROBOTSTXT_OBEY = False
 ROBOTSTXT_OBEY = False
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 CONCURRENT_REQUESTS = 32
 CONCURRENT_REQUESTS = 32
-
+RETRY_TIMES = 3  # 请求失败时重试次数
+DOWNLOAD_TIMEOUT = 15  # 默认下载超时时间,单位秒
 # Configure a delay for requests for the same website (default: 0)
 # Configure a delay for requests for the same website (default: 0)
 # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
 # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
 # See also autothrottle settings and docs
-DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 1
 RANDOMIZE_DOWNLOAD_DELAY = True
 RANDOMIZE_DOWNLOAD_DELAY = True
 # The download delay setting will honor only one of:
 # The download delay setting will honor only one of:
 # CONCURRENT_REQUESTS_PER_DOMAIN = 16
 # CONCURRENT_REQUESTS_PER_DOMAIN = 16
 # CONCURRENT_REQUESTS_PER_IP = 16
 # CONCURRENT_REQUESTS_PER_IP = 16
+# 配置日志级别,可选 DEBUG、INFO、WARNING、ERROR 等,Scrapy 默认为 DEBUG
+
+# 配置日志格式
+LOG_FORMAT = '%(asctime)s [%(name)s] %(levelname)s: %(message)s'
+
+# 自定义一个文件处理器,将日志写入文件
+LOG_FILE = 'logs/scrapy.log'  # 指定日志文件路径,相对或绝对路径
+
+# 配置日志处理器,这里只配置了一个文件处理器
+LOG_HANDLERS = {
+    'file': {
+        'class': 'logging.FileHandler',
+        'formatter': 'default',
+        'filename': LOG_FILE,
+    },
+}
+
+# 定义日志处理器格式化器
+LOG_FORMATTERS = {
+    'default': {
+        'format': LOG_FORMAT,
+        'datefmt': '%Y-%m-%d %H:%M:%S',
+    },
+}
 
 
+# 配置日志处理器字典,定义哪些logger使用哪些处理器
+LOGGGER_DELEGATION = {
+    'scrapy': {
+        'handlers': ['file'],
+        'level': LOG_LEVEL,
+        'propagate': False,  # 不传播给父logger
+    },
+    'twisted.core.engine': {  # 可以类似方式配置其他核心组件
+        'handlers': ['file'],
+        'level': LOG_LEVEL,
+        'propagate': False,
+    },
+}
 # Disable cookies (enabled by default)
 # Disable cookies (enabled by default)
 COOKIES_ENABLED = False
 COOKIES_ENABLED = False
 
 

BIN
nmc/spiders/__pycache__/nmcspider.cpython-311.pyc


+ 74 - 64
nmc/spiders/nmcspider.py

@@ -2,11 +2,13 @@ import io
 import os.path
 import os.path
 import time
 import time
 from datetime import datetime
 from datetime import datetime
+from logging import log
 from urllib.parse import urlparse
 from urllib.parse import urlparse
 
 
 import scrapy
 import scrapy
 from PIL import Image
 from PIL import Image
 from selenium import webdriver
 from selenium import webdriver
+from selenium.common import TimeoutException
 from selenium.webdriver import ActionChains
 from selenium.webdriver import ActionChains
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.chrome.options import Options
 from selenium.webdriver.common.by import By
 from selenium.webdriver.common.by import By
@@ -31,7 +33,9 @@ class NmcspiderSpider(scrapy.Spider):
     def __init__(self):
     def __init__(self):
         chrome_path = r'E:\chromedriver_win32\chromedriver.exe'
         chrome_path = r'E:\chromedriver_win32\chromedriver.exe'
         chrome_options = Options()
         chrome_options = Options()
-        # 添加路径到chrome_options
+        # 添加路径到chrome_options
+        chrome_options = webdriver.ChromeOptions()
+        chrome_options.add_experimental_option("detach", True)
         chrome_options.add_argument('--webdriver-executable-path=' + chrome_path)
         chrome_options.add_argument('--webdriver-executable-path=' + chrome_path)
         # 如果需要添加其他选项,例如禁用浏览器窗口,可以这样:
         # 如果需要添加其他选项,例如禁用浏览器窗口,可以这样:
         chrome_options.add_argument('--headless')
         chrome_options.add_argument('--headless')
@@ -54,7 +58,6 @@ class NmcspiderSpider(scrapy.Spider):
 
 
     def parse_province_details(self, response):
     def parse_province_details(self, response):
         provinces_data = response.json()
         provinces_data = response.json()
-
         for provinces in provinces_data:
         for provinces in provinces_data:
             province_url = provinces.get('url')
             province_url = provinces.get('url')
             province = provinces.get('province')
             province = provinces.get('province')
@@ -72,59 +75,61 @@ class NmcspiderSpider(scrapy.Spider):
         self.driver.set_window_size(1920, 1080)
         self.driver.set_window_size(1920, 1080)
         # 创建文件目录
         # 创建文件目录
         today_str = datetime.now().strftime("%Y-%m-%d")
         today_str = datetime.now().strftime("%Y-%m-%d")
-        path_to_save = os.path.join(province, city, today_str)
+        path_to_save = os.path.join("resource", province, city, today_str)
         full_path = os.path.join(".", path_to_save)
         full_path = os.path.join(".", path_to_save)
         if not os.path.exists(full_path):
         if not os.path.exists(full_path):
             os.makedirs(full_path)
             os.makedirs(full_path)
+        # 获取第一张图片-天气预报
+        element = WebDriverWait(self.driver, 20).until(
+            EC.presence_of_element_located((By.XPATH, '/html/body/div[2]/div')))
+        self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
+        location = element.location
+        size = element.size
+        # 获取天气预报图
+        # 使用PIL处理图像
+        screenshot = self.driver.get_screenshot_as_png()
+        image = Image.open(io.BytesIO(screenshot))
+        crop_area = (location['x'], location['y'], location['x'] + size['width'], location['y'] + size['height'])
+        cropped_image = image.crop(crop_area)
+        # 将图像转换为RGB模式,因为JPEG格式需要此模式
+        cropped_image = cropped_image.convert("RGB")
+        # 保存裁剪后的图片
+        url = urlparse(response.url)
+        parts = url.path.split('/')
+        target_part = '_'.join([parts[3], parts[4].split('.')[0]])
+        name = target_part + "_1.png"
+        jpeg_name = name.rsplit('.', 1)[0] + ".jpg"
+        jpeg_path = os.path.join(full_path, jpeg_name)
+        quality = 50
+        # image_path = os.path.join(full_path, name)
+        cropped_image.save(jpeg_path, "JPEG", quality=quality)
+        # 获取第二张图片
+        highcharts = WebDriverWait(self.driver, 20).until(
+            EC.presence_of_element_located((By.ID, 'realChart')))
+        self.driver.execute_script("arguments[0].scrollIntoView({block: 'center', behavior: 'instant'});",
+                                   highcharts)
+        # time.sleep(2)
+        # 获取屏幕截图
+        screenshot = self.driver.get_screenshot_as_png()
+        script = """
+                var element = document.getElementById('realChart');
+                var rect = element.getBoundingClientRect();
+                return {left: rect.left, top: rect.top, width: rect.width, height: rect.height};
+                """
+        position_info = self.driver.execute_script(script)
+        # 获取天气预报图
+        # 使用PIL处理图像
+        image = Image.open(io.BytesIO(screenshot))
+        crop_area = (
+            position_info['left'], position_info['top'],
+            position_info['left'] + position_info['width'],
+            position_info['top'] + position_info['height'])
+        cropped_image = image.crop(crop_area)
+        # 保存裁剪后的图片
+        name = target_part + "_2.png"
+        image_path = os.path.join(full_path, name)
+        cropped_image.save(image_path)
         try:
         try:
-            # 获取第一张图片-天气预报
-            element = WebDriverWait(self.driver, 20).until(
-                EC.presence_of_element_located((By.XPATH, '/html/body/div[2]/div')))
-            self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
-            location = element.location
-            size = element.size
-            # 获取天气预报图
-            # 使用PIL和numpy处理图像
-            screenshot = self.driver.get_screenshot_as_png()
-            image = Image.open(io.BytesIO(screenshot))
-            crop_area = (location['x'], location['y'], location['x'] + size['width'], location['y'] + size['height'])
-            cropped_image = image.crop(crop_area)
-            # 保存裁剪后的图片
-            url = urlparse(response.url)
-            parts = url.path.split('/')
-            target_part = '_'.join([parts[3], parts[4].split('.')[0]])
-            name = target_part + "_1.png"
-            image_path = os.path.join(full_path, name)
-            cropped_image.save(image_path)
-            # 获取第二张图片
-            highcharts = WebDriverWait(self.driver, 20).until(
-                EC.presence_of_element_located((By.ID, 'realChart')))
-            self.driver.execute_script("arguments[0].scrollIntoView({block: 'center', behavior: 'instant'});",
-                                       highcharts)
-            time.sleep(2)
-            # 等待js全部加载完成
-            js_is_complete(self.driver)
-            WebDriverWait(self.driver, 20).until(js_is_complete)
-            # 获取屏幕截图
-            screenshot = self.driver.get_screenshot_as_png()
-            script = """
-            var element = document.getElementById('realChart');
-            var rect = element.getBoundingClientRect();
-            return {left: rect.left, top: rect.top, width: rect.width, height: rect.height};
-            """
-            position_info = self.driver.execute_script(script)
-            # 获取天气预报图
-            # 使用PIL和numpy处理图像
-            image = Image.open(io.BytesIO(screenshot))
-            crop_area = (
-                position_info['left'], position_info['top'],
-                position_info['left'] + position_info['width'],
-                position_info['top'] + position_info['height'])
-            cropped_image = image.crop(crop_area)
-            # # 保存裁剪后的图片
-            name = target_part + "_2.png"
-            image_path = os.path.join(full_path, name)
-            cropped_image.save(image_path)
             # 第三张图片
             # 第三张图片
             # 等待js全部加载完成
             # 等待js全部加载完成
             js_is_complete(self.driver)
             js_is_complete(self.driver)
@@ -134,24 +139,23 @@ class NmcspiderSpider(scrapy.Spider):
             self.driver.execute_script("arguments[0].scrollIntoView({block: 'center', behavior: 'instant'});",
             self.driver.execute_script("arguments[0].scrollIntoView({block: 'center', behavior: 'instant'});",
                                        highcharts)
                                        highcharts)
             # 找到点击的按钮
             # 找到点击的按钮
+            WebDriverWait(self.driver, 20).until(
+                EC.presence_of_element_located((By.CSS_SELECTOR,
+                                                '#container> .highcharts-container >svg > g > g > g >g.highcharts-legend-item.highcharts-column-series.highcharts-color-undefined.highcharts-series-0')))
             element = self.driver.find_element(By.CSS_SELECTOR,
             element = self.driver.find_element(By.CSS_SELECTOR,
                                                '#container> .highcharts-container >svg > g > g > g >g.highcharts-legend-item.highcharts-column-series.highcharts-color-undefined.highcharts-series-0')
                                                '#container> .highcharts-container >svg > g > g > g >g.highcharts-legend-item.highcharts-column-series.highcharts-color-undefined.highcharts-series-0')
             self.driver.execute_script("arguments[0].dispatchEvent(new Event('click'));",
             self.driver.execute_script("arguments[0].dispatchEvent(new Event('click'));",
                                        element)
                                        element)
-            # target_element = WebDriverWait(self.driver, 20).until(
-            #     EC.visibility_of_element_located((By.CSS_SELECTOR,
-            #                                       "#highcharts-v9zgpzc-36 > svg > g.highcharts-legend > g > g > g.highcharts-legend-item.highcharts-column-series.highcharts-color-undefined.highcharts-series-0 > text")))
-            # target_element.click()
             # 等待点击之后页面加载完成
             # 等待点击之后页面加载完成
             WebDriverWait(self.driver, 20).until(EC.visibility_of_element_located((By.ID, 'container')))
             WebDriverWait(self.driver, 20).until(EC.visibility_of_element_located((By.ID, 'container')))
-            time.sleep(2)
+            # time.sleep(2)
             # 获取屏幕截图
             # 获取屏幕截图
             screenshot = self.driver.get_screenshot_as_png()
             screenshot = self.driver.get_screenshot_as_png()
             scripts = """
             scripts = """
-                var element = document.getElementById('container');
-                var rect = element.getBoundingClientRect();
-                return {left: rect.left, top: rect.top, width: rect.width, height: rect.height};
-                """
+                        var element = document.getElementById('container');
+                        var rect = element.getBoundingClientRect();
+                        return {left: rect.left, top: rect.top, width: rect.width, height: rect.height};
+                        """
             position_info = self.driver.execute_script(scripts)
             position_info = self.driver.execute_script(scripts)
             # 获取天气预报图
             # 获取天气预报图
             # 使用PIL和numpy处理图像
             # 使用PIL和numpy处理图像
@@ -161,10 +165,16 @@ class NmcspiderSpider(scrapy.Spider):
                 position_info['left'] + position_info['width'],
                 position_info['left'] + position_info['width'],
                 position_info['top'] + position_info['height'])
                 position_info['top'] + position_info['height'])
             cropped_image = image.crop(crop_area)
             cropped_image = image.crop(crop_area)
-            # # 保存裁剪后的图片
+            # 保存裁剪后的图片
             name = target_part + "_3.png"
             name = target_part + "_3.png"
             image_path = os.path.join(full_path, name)
             image_path = os.path.join(full_path, name)
             cropped_image.save(image_path)
             cropped_image.save(image_path)
-
-        finally:
-            print("关闭浏览器")
+        except TimeoutException:
+            default_image_path = "/default_image.png"  # 这里填写默认图片的路径
+            backup_image = Image.open(default_image_path)
+            name = target_part + "_3.png"
+            image_path = os.path.join(full_path, name)
+            backup_image.save(image_path)
+            current_url = self.driver.current_url  # 获取当前页面的URL
+            self.logger.error(
+                f"在页面 {current_url} 上,在指定时间内未能找到指定元素使用默认图片")