
Scrape weather forecast chart screenshots from the meteorological bureau website (nmc.cn)

huangyan committed ae03b29087, 10 months ago
14 changed files with 322 additions and 0 deletions
  1. .idea/.gitignore (+8, -0)
  2. .idea/misc.xml (+6, -0)
  3. .idea/modules.xml (+8, -0)
  4. .idea/nmc.iml (+15, -0)
  5. nmc/__init__.py (+0, -0)
  6. nmc/cmd.py (+3, -0)
  7. nmc/compress_image.py (+35, -0)
  8. nmc/items.py (+12, -0)
  9. nmc/middlewares.py (+24, -0)
  10. nmc/pipelines.py (+13, -0)
  11. nmc/settings.py (+96, -0)
  12. nmc/spiders/__init__.py (+4, -0)
  13. nmc/spiders/nmcspider.py (+87, -0)
  14. scrapy.cfg (+11, -0)

+ 8 - 0
.idea/.gitignore

@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml

+ 6 - 0
.idea/misc.xml

@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="Black">
+    <option name="sdkName" value="Python 3.11" />
+  </component>
+</project>

+ 8 - 0
.idea/modules.xml

@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/nmc.iml" filepath="$PROJECT_DIR$/.idea/nmc.iml" />
+    </modules>
+  </component>
+</project>

+ 15 - 0
.idea/nmc.iml

@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="WEB_MODULE" version="4">
+  <component name="FacetManager">
+    <facet type="Python" name="Python facet">
+      <configuration sdkName="Python 3.11" />
+    </facet>
+  </component>
+  <component name="Go" enabled="true" />
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+    <orderEntry type="library" name="Python 3.11 interpreter library" level="application" />
+  </component>
+</module>

+ 0 - 0
nmc/__init__.py


+ 3 - 0
nmc/cmd.py

@@ -0,0 +1,3 @@
+from scrapy import cmdline
+
+cmdline.execute("scrapy crawl nmcspider".split())

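Note: nmc/cmd.py is a convenience launcher. If you prefer to start the crawl in-process rather than through cmdline.execute, roughly the same thing can be done with Scrapy's CrawlerProcess API; a minimal sketch (run from the project root so nmc.settings is picked up):

    # Sketch: in-process equivalent of "scrapy crawl nmcspider"
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl("nmcspider")
    process.start()
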
+ 35 - 0
nmc/compress_image.py

@@ -0,0 +1,35 @@
+import io
+
+from PIL import Image
+
+
+def compress_image(image_path, output_path, quality=90, resize=None):
+    """Compress an image and save it to the given path.
+
+    :param image_path: path to the original image
+    :param output_path: path for the compressed image
+    :param quality: compression quality, 1-100 (default 90); lower means smaller files but worse quality
+    :param resize: optional (width, height) bound the image is shrunk to fit before saving
+    """
+    img = Image.open(image_path)
+    if resize:
+        img.thumbnail(resize)
+    img.save(output_path, optimize=True, quality=quality)
+
+def scroll_to_element(driver, element):
+    """Scroll the browser viewport to the given element."""
+    driver.execute_script("arguments[0].scrollIntoView();", element)
+
+def capture_element_screenshot(driver, element, filename):
+    """Capture a screenshot of the given element and save it to filename."""
+    location = element.location
+    size = element.size
+    screenshot = driver.get_screenshot_as_png()  # screenshot of the visible page
+    image = Image.open(io.BytesIO(screenshot))
+    # Work out the element's bounding box within the screenshot and crop to it
+    left = location['x']
+    top = location['y']
+    right = location['x'] + size['width']
+    bottom = location['y'] + size['height']
+    cropped_image = image.crop((left, top, right, bottom))  # PIL crop order is (left, top, right, bottom)
+    cropped_image.save(filename)

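The helpers above are generic Selenium/PIL utilities. A hypothetical usage sketch (the URL and element id are assumptions borrowed from the spider in this commit, and the bare webdriver.Chrome() call assumes a Selenium version recent enough to resolve chromedriver automatically):

    # Sketch: capture one element and compress the result with the helpers above
    from selenium import webdriver
    from selenium.webdriver.common.by import By

    from nmc.compress_image import capture_element_screenshot, compress_image, scroll_to_element

    driver = webdriver.Chrome()                           # Selenium Manager locates chromedriver
    driver.get("http://www.nmc.cn")                       # placeholder page
    chart = driver.find_element(By.ID, "forecastChart")   # element id taken from nmcspider.py
    scroll_to_element(driver, chart)
    capture_element_screenshot(driver, chart, "chart.png")
    compress_image("chart.png", "chart_small.png", quality=70, resize=(800, 600))
    driver.quit()

Note that capture_element_screenshot crops page coordinates out of a viewport screenshot, so it is most reliable when the element is visible without scrolling.
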
+ 12 - 0
nmc/items.py

@@ -0,0 +1,12 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class NmcItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    pass

+ 24 - 0
nmc/middlewares.py

@@ -0,0 +1,24 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter
+from scrapy.exceptions import IgnoreRequest
+
+
+class SeleniumDownloaderMiddleware:
+    def __init__(self):
+        # Create the Selenium driver once for the middleware's lifetime
+        chrome_path = r'E:\ProgramData\anaconda3\chromedriver.exe'
+        self.driver = webdriver.Chrome(service=Service(chrome_path))
+
+    def process_request(self, request, spider):
+        # Load the requested URL in the Selenium-driven browser
+        self.driver.get(request.url)
+        # Returning None here lets Scrapy continue with its normal download as well
+        print(request.url, "middleware")

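As committed, SeleniumDownloaderMiddleware loads the page in Chrome but returns None from process_request, so Scrapy still downloads the URL itself and the spider never sees the browser-rendered HTML. A minimal sketch of the usual pattern, returning the rendered page to Scrapy (the class name and driver setup are assumptions, not part of this commit):

    # Sketch: hand the Selenium-rendered page back to Scrapy
    from scrapy.http import HtmlResponse
    from selenium import webdriver

    class SeleniumRenderMiddleware:
        def __init__(self):
            self.driver = webdriver.Chrome()  # assumes Selenium Manager finds chromedriver

        def process_request(self, request, spider):
            self.driver.get(request.url)
            # Returning a Response from process_request short-circuits Scrapy's own download
            return HtmlResponse(url=self.driver.current_url, body=self.driver.page_source,
                                encoding="utf-8", request=request)
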
+ 13 - 0
nmc/pipelines.py

@@ -0,0 +1,13 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter
+
+
+class NmcPipeline:
+    def process_item(self, item, spider):
+        return item

+ 96 - 0
nmc/settings.py

@@ -0,0 +1,96 @@
+# Scrapy settings for nmc project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://docs.scrapy.org/en/latest/topics/settings.html
+#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = "nmc"
+
+SPIDER_MODULES = ["nmc.spiders"]
+NEWSPIDER_MODULE = "nmc.spiders"
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0"
+# REACTOR = 'twisted.internet.selectreactor.SelectReactor'  # not a Scrapy setting name; the reactor is chosen via TWISTED_REACTOR below
+LOG_LEVEL = 'DEBUG'
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = False
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+DOWNLOAD_DELAY = 3
+RANDOMIZE_DOWNLOAD_DELAY = True
+# The download delay setting will honor only one of:
+# CONCURRENT_REQUESTS_PER_DOMAIN = 16
+# CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+# TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+DEFAULT_REQUEST_HEADERS = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+    'Accept-Language': 'en',
+}
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+# SPIDER_MIDDLEWARES = {
+#     "nmc.middlewares.NmcSpiderMiddleware": 543,
+# }
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+# DOWNLOADER_MIDDLEWARES = {
+#     # "nmc.middlewares.NmcDownloaderMiddleware": 543,
+#     "nmc.middlewares.SeleniumDownloaderMiddleware": 543,
+#     'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None
+# }
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+# EXTENSIONS = {
+#    "scrapy.extensions.telnet.TelnetConsole": None,
+# }
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+    "nmc.pipelines.NmcPipeline": 300,
+}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+# AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+# AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+# AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+# AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = "httpcache"
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+
+# Set settings whose default value is deprecated to a future-proof value
+REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
+TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
+FEED_EXPORT_ENCODING = "utf-8"

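Note that the DOWNLOADER_MIDDLEWARES block above is commented out, so the Selenium middleware from nmc/middlewares.py is not active in this commit; the spider drives its own browser instead. If it were enabled, the setting would look roughly like this (priority 543 taken from the commented block):

    # Sketch: activating the Selenium middleware and disabling the stock user-agent middleware
    DOWNLOADER_MIDDLEWARES = {
        "nmc.middlewares.SeleniumDownloaderMiddleware": 543,
        "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": None,
    }
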
+ 4 - 0
nmc/spiders/__init__.py

@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.

+ 87 - 0
nmc/spiders/nmcspider.py

@@ -0,0 +1,87 @@
+import io
+import os.path
+
+import scrapy
+from PIL import Image
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+
+
+class NmcspiderSpider(scrapy.Spider):
+    name = "nmcspider"
+    allowed_domains = ["www.nmc.cn"]
+    start_urls = ["http://www.nmc.cn/rest/province"]
+
+    def start_requests(self):
+        yield scrapy.Request(url=self.start_urls[0], callback=self.parse_provinces)
+
+    def __init__(self):
+        chrome_path = r'E:\chromedriver_win32\chromedriver.exe'
+        chrome_options = Options()
+        # Extra options can be added here, e.g. to run without a browser window:
+        # chrome_options.add_argument('--headless')
+        # chrome_options.add_argument('--disable-gpu')
+        # Pass the chromedriver path via a Service object (Selenium 4 style)
+        self.driver = webdriver.Chrome(service=Service(chrome_path), options=chrome_options)
+
+    def closed(self, reason):
+        self.driver.quit()
+
+    def parse_provinces(self, response):
+        provinces_data = response.json()
+        # Iterate over the province list and issue one request per province
+        for province in provinces_data:
+            # Each province entry carries a 'code' field used to build its detail URL
+            province_url = province.get('code')
+            if province_url:
+                yield scrapy.Request(url="http://www.nmc.cn/rest/province/" + province_url,
+                                     callback=self.parse_province_details)
+
+    def parse_province_details(self, response):
+        provinces_data = response.json()
+
+        for provinces in provinces_data:
+            province_url = provinces.get('url')
+            province = provinces.get('province')
+            city = provinces.get('city')
+            if province_url:
+                yield scrapy.Request(url="http://www.nmc.cn" + province_url, callback=self.parse,
+                                     meta={"province": province, "city": city})
+
+    def parse(self, response):
+        # province name
+        province = response.meta['province']
+        # city name
+        city = response.meta['city']
+        self.driver.get(response.url)
+        self.driver.set_window_size(1920, 1080)
+        title = response.xpath('//*[@id="realChart"]/div[1]/span[1]/text()').get()
+        # Create the output directory <province>/<city> if it does not exist
+        path_to_save = os.path.join(province, city)
+        full_path = os.path.join(".", path_to_save)
+        if not os.path.exists(full_path):
+            os.makedirs(full_path)
+        try:
+            # Wait for the forecast chart, scroll it into view, then crop it out
+            element = WebDriverWait(self.driver, 20).until(
+                EC.presence_of_element_located((By.ID, 'forecastChart')))
+            self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
+            location = element.location
+            size = element.size
+            # Take a full-page screenshot and cut out the forecast chart with PIL
+            screenshot = self.driver.get_screenshot_as_png()
+            image = Image.open(io.BytesIO(screenshot))
+            crop_area = (location['x'], location['y'], location['x'] + size['width'], location['y'] + size['height'])
+            print(crop_area, "=========================")
+            cropped_image = image.crop(crop_area)
+            # Save the cropped forecast image
+            image_path = os.path.join(full_path, city + '天气预报图.png')
+            cropped_image.save(image_path)
+
+        finally:
+            # The driver itself is quit in closed() when the spider finishes
+            print("closing browser")

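The spider saves the cropped forecast chart but never calls the compress_image helper added in nmc/compress_image.py. A hypothetical follow-up inside parse(), right after cropped_image.save(image_path), could shrink the saved file (the output name and parameters here are assumptions):

    # Sketch: compress the chart that parse() just saved
    from nmc.compress_image import compress_image  # would normally sit with the other imports

    compressed_path = os.path.join(full_path, city + '天气预报图_small.png')  # hypothetical name
    compress_image(image_path, compressed_path, quality=75, resize=(1280, 720))
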
+ 11 - 0
scrapy.cfg

@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = nmc.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = nmc