
Scrape weather forecast chart screenshots from the meteorological bureau website (nmc.cn)

huangyan committed ae03b29087, 10 months ago
14 changed files with 322 additions and 0 deletions
  1. .idea/.gitignore (+8, -0)
  2. .idea/misc.xml (+6, -0)
  3. .idea/modules.xml (+8, -0)
  4. .idea/nmc.iml (+15, -0)
  5. nmc/__init__.py (+0, -0)
  6. nmc/cmd.py (+3, -0)
  7. nmc/compress_image.py (+35, -0)
  8. nmc/items.py (+12, -0)
  9. nmc/middlewares.py (+24, -0)
  10. nmc/pipelines.py (+13, -0)
  11. nmc/settings.py (+96, -0)
  12. nmc/spiders/__init__.py (+4, -0)
  13. nmc/spiders/nmcspider.py (+87, -0)
  14. scrapy.cfg (+11, -0)

+ 8 - 0
.idea/.gitignore

@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml

+ 6 - 0
.idea/misc.xml

@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="Black">
+    <option name="sdkName" value="Python 3.11" />
+  </component>
+</project>

+ 8 - 0
.idea/modules.xml

@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/nmc.iml" filepath="$PROJECT_DIR$/.idea/nmc.iml" />
+    </modules>
+  </component>
+</project>

+ 15 - 0
.idea/nmc.iml

@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="WEB_MODULE" version="4">
+  <component name="FacetManager">
+    <facet type="Python" name="Python facet">
+      <configuration sdkName="Python 3.11" />
+    </facet>
+  </component>
+  <component name="Go" enabled="true" />
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+    <orderEntry type="library" name="Python 3.11 interpreter library" level="application" />
+  </component>
+</module>

+ 0 - 0
nmc/__init__.py


+ 3 - 0
nmc/cmd.py

@@ -0,0 +1,3 @@
+from scrapy import cmdline
+
+cmdline.execute("scrapy crawl nmcspider".split())

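Note: nmc/cmd.py is a convenience launcher. If you prefer to start the crawl in-process rather than through cmdline.execute, roughly the same thing can be done with Scrapy's CrawlerProcess API; a minimal sketch (run from the project root so nmc.settings is picked up):

    # Sketch: in-process equivalent of "scrapy crawl nmcspider"
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl("nmcspider")
    process.start()
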
+ 35 - 0
nmc/compress_image.py

@@ -0,0 +1,35 @@
+import io
+
+from PIL import Image
+
+
+def compress_image(image_path, output_path, quality=90, resize=None):
+    """Compress an image and save it to the given path.
+
+    :param image_path: path to the original image
+    :param output_path: path for the compressed image
+    :param quality: compression quality, 1-100 (default 90); lower means smaller files but worse quality
+    :param resize: optional (width, height) bound the image is shrunk to fit before saving
+    """
+    img = Image.open(image_path)
+    if resize:
+        img.thumbnail(resize)
+    img.save(output_path, optimize=True, quality=quality)
+
+def scroll_to_element(driver, element):
+    """Scroll the browser viewport to the given element."""
+    driver.execute_script("arguments[0].scrollIntoView();", element)
+
+def capture_element_screenshot(driver, element, filename):
+    """Capture a screenshot of the given element and save it to filename."""
+    location = element.location
+    size = element.size
+    screenshot = driver.get_screenshot_as_png()  # screenshot of the visible page
+    image = Image.open(io.BytesIO(screenshot))
+    # Work out the element's bounding box within the screenshot and crop to it
+    left = location['x']
+    top = location['y']
+    right = location['x'] + size['width']
+    bottom = location['y'] + size['height']
+    cropped_image = image.crop((left, top, right, bottom))  # PIL crop order is (left, top, right, bottom)
+    cropped_image.save(filename)

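The helpers above are generic Selenium/PIL utilities. A hypothetical usage sketch (the URL and element id are assumptions borrowed from the spider in this commit, and the bare webdriver.Chrome() call assumes a Selenium version recent enough to resolve chromedriver automatically):

    # Sketch: capture one element and compress the result with the helpers above
    from selenium import webdriver
    from selenium.webdriver.common.by import By

    from nmc.compress_image import capture_element_screenshot, compress_image, scroll_to_element

    driver = webdriver.Chrome()                           # Selenium Manager locates chromedriver
    driver.get("http://www.nmc.cn")                       # placeholder page
    chart = driver.find_element(By.ID, "forecastChart")   # element id taken from nmcspider.py
    scroll_to_element(driver, chart)
    capture_element_screenshot(driver, chart, "chart.png")
    compress_image("chart.png", "chart_small.png", quality=70, resize=(800, 600))
    driver.quit()

Note that capture_element_screenshot crops page coordinates out of a viewport screenshot, so it is most reliable when the element is visible without scrolling.
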
+ 12 - 0
nmc/items.py

@@ -0,0 +1,12 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class NmcItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    pass

+ 24 - 0
nmc/middlewares.py

@@ -0,0 +1,24 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter
+from scrapy.exceptions import IgnoreRequest
+
+
+class SeleniumDownloaderMiddleware:
+    def __init__(self):
+        # Create the Selenium driver once for the middleware's lifetime
+        chrome_path = r'E:\ProgramData\anaconda3\chromedriver.exe'
+        self.driver = webdriver.Chrome(service=Service(chrome_path))
+
+    def process_request(self, request, spider):
+        # Load the requested URL in the Selenium-driven browser
+        self.driver.get(request.url)
+        # Returning None here lets Scrapy continue with its normal download as well
+        print(request.url, "middleware")

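As committed, SeleniumDownloaderMiddleware loads the page in Chrome but returns None from process_request, so Scrapy still downloads the URL itself and the spider never sees the browser-rendered HTML. A minimal sketch of the usual pattern, returning the rendered page to Scrapy (the class name and driver setup are assumptions, not part of this commit):

    # Sketch: hand the Selenium-rendered page back to Scrapy
    from scrapy.http import HtmlResponse
    from selenium import webdriver

    class SeleniumRenderMiddleware:
        def __init__(self):
            self.driver = webdriver.Chrome()  # assumes Selenium Manager finds chromedriver

        def process_request(self, request, spider):
            self.driver.get(request.url)
            # Returning a Response from process_request short-circuits Scrapy's own download
            return HtmlResponse(url=self.driver.current_url, body=self.driver.page_source,
                                encoding="utf-8", request=request)
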
+ 13 - 0
nmc/pipelines.py

@@ -0,0 +1,13 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter
+
+
+class NmcPipeline:
+    def process_item(self, item, spider):
+        return item

+ 96 - 0
nmc/settings.py

@@ -0,0 +1,96 @@
+# Scrapy settings for nmc project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://docs.scrapy.org/en/latest/topics/settings.html
+#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = "nmc"
+
+SPIDER_MODULES = ["nmc.spiders"]
+NEWSPIDER_MODULE = "nmc.spiders"
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0"
+# REACTOR = 'twisted.internet.selectreactor.SelectReactor'  # not a Scrapy setting name; the reactor is chosen via TWISTED_REACTOR below
+LOG_LEVEL = 'DEBUG'
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = False
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+DOWNLOAD_DELAY = 3
+RANDOMIZE_DOWNLOAD_DELAY = True
+# The download delay setting will honor only one of:
+# CONCURRENT_REQUESTS_PER_DOMAIN = 16
+# CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+# TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+DEFAULT_REQUEST_HEADERS = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+    'Accept-Language': 'en',
+}
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+# SPIDER_MIDDLEWARES = {
+#     "nmc.middlewares.NmcSpiderMiddleware": 543,
+# }
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+# DOWNLOADER_MIDDLEWARES = {
+#     # "nmc.middlewares.NmcDownloaderMiddleware": 543,
+#     "nmc.middlewares.SeleniumDownloaderMiddleware": 543,
+#     'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None
+# }
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+# EXTENSIONS = {
+#    "scrapy.extensions.telnet.TelnetConsole": None,
+# }
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+    "nmc.pipelines.NmcPipeline": 300,
+}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+# AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+# AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+# AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+# AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = "httpcache"
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+
+# Set settings whose default value is deprecated to a future-proof value
+REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
+TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
+FEED_EXPORT_ENCODING = "utf-8"

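Note that the DOWNLOADER_MIDDLEWARES block above is commented out, so the Selenium middleware from nmc/middlewares.py is not active in this commit; the spider drives its own browser instead. If it were enabled, the setting would look roughly like this (priority 543 taken from the commented block):

    # Sketch: activating the Selenium middleware and disabling the stock user-agent middleware
    DOWNLOADER_MIDDLEWARES = {
        "nmc.middlewares.SeleniumDownloaderMiddleware": 543,
        "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": None,
    }
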
+ 4 - 0
nmc/spiders/__init__.py

@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.

+ 87 - 0
nmc/spiders/nmcspider.py

@@ -0,0 +1,87 @@
+import io
+import os.path
+
+import scrapy
+from PIL import Image
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+
+
+class NmcspiderSpider(scrapy.Spider):
+    name = "nmcspider"
+    allowed_domains = ["www.nmc.cn"]
+    start_urls = ["http://www.nmc.cn/rest/province"]
+
+    def start_requests(self):
+        yield scrapy.Request(url=self.start_urls[0], callback=self.parse_provinces)
+
+    def __init__(self):
+        chrome_path = r'E:\chromedriver_win32\chromedriver.exe'
+        chrome_options = Options()
+        # Extra options can be added here, e.g. to run without a browser window:
+        # chrome_options.add_argument('--headless')
+        # chrome_options.add_argument('--disable-gpu')
+        # Pass the chromedriver path via a Service object (Selenium 4 style)
+        self.driver = webdriver.Chrome(service=Service(chrome_path), options=chrome_options)
+
+    def closed(self, reason):
+        self.driver.quit()
+
+    def parse_provinces(self, response):
+        provinces_data = response.json()
+        # Iterate over the province list and issue one request per province
+        for province in provinces_data:
+            # Each province entry carries a 'code' field used to build its detail URL
+            province_url = province.get('code')
+            if province_url:
+                yield scrapy.Request(url="http://www.nmc.cn/rest/province/" + province_url,
+                                     callback=self.parse_province_details)
+
+    def parse_province_details(self, response):
+        provinces_data = response.json()
+
+        for provinces in provinces_data:
+            province_url = provinces.get('url')
+            province = provinces.get('province')
+            city = provinces.get('city')
+            if province_url:
+                yield scrapy.Request(url="http://www.nmc.cn" + province_url, callback=self.parse,
+                                     meta={"province": province, "city": city})
+
+    def parse(self, response):
+        # province name
+        province = response.meta['province']
+        # city name
+        city = response.meta['city']
+        self.driver.get(response.url)
+        self.driver.set_window_size(1920, 1080)
+        title = response.xpath('//*[@id="realChart"]/div[1]/span[1]/text()').get()
+        # Create the output directory <province>/<city> if it does not exist
+        path_to_save = os.path.join(province, city)
+        full_path = os.path.join(".", path_to_save)
+        if not os.path.exists(full_path):
+            os.makedirs(full_path)
+        try:
+            # Wait for the forecast chart, scroll it into view, then crop it out
+            element = WebDriverWait(self.driver, 20).until(
+                EC.presence_of_element_located((By.ID, 'forecastChart')))
+            self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
+            location = element.location
+            size = element.size
+            # Take a full-page screenshot and cut out the forecast chart with PIL
+            screenshot = self.driver.get_screenshot_as_png()
+            image = Image.open(io.BytesIO(screenshot))
+            crop_area = (location['x'], location['y'], location['x'] + size['width'], location['y'] + size['height'])
+            print(crop_area, "=========================")
+            cropped_image = image.crop(crop_area)
+            # Save the cropped forecast image
+            image_path = os.path.join(full_path, city + '天气预报图.png')
+            cropped_image.save(image_path)
+
+        finally:
+            # The driver itself is quit in closed() when the spider finishes
+            print("closing browser")

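The spider saves the cropped forecast chart but never calls the compress_image helper added in nmc/compress_image.py. A hypothetical follow-up inside parse(), right after cropped_image.save(image_path), could shrink the saved file (the output name and parameters here are assumptions):

    # Sketch: compress the chart that parse() just saved
    from nmc.compress_image import compress_image  # would normally sit with the other imports

    compressed_path = os.path.join(full_path, city + '天气预报图_small.png')  # hypothetical name
    compress_image(image_path, compressed_path, quality=75, resize=(1280, 720))
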
+ 11 - 0
scrapy.cfg

@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = nmc.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = nmc