# Scrapy settings for nmc project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "nmc"

SPIDER_MODULES = ["nmc.spiders"]
NEWSPIDER_MODULE = "nmc.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0"

# NOTE(review): the original file set
#   REACTOR = 'twisted.internet.selectreactor.SelectReactor'
# but "REACTOR" is not a Scrapy setting (the correct name is TWISTED_REACTOR)
# and it contradicted the AsyncioSelectorReactor selected at the bottom of
# this file. The dead setting has been removed; TWISTED_REACTOR below is the
# single source of truth for the reactor choice.

# Log level: DEBUG, INFO, WARNING, ERROR, ... (Scrapy's default is DEBUG)
LOG_LEVEL = 'ERROR'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32
RETRY_TIMES = 3        # number of retries when a request fails
DOWNLOAD_TIMEOUT = 15  # default download timeout, in seconds

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
RANDOMIZE_DOWNLOAD_DELAY = True
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Log record format
LOG_FORMAT = '%(asctime)s [%(name)s] %(levelname)s: %(message)s'
# Log file path (relative or absolute)
LOG_FILE = 'logs/scrapy.log'

# NOTE(review): LOG_HANDLERS, LOG_FORMATTERS and LOGGER_DELEGATION below are
# NOT standard Scrapy settings — Scrapy itself silently ignores them. They are
# kept in case project code reads them from the settings object explicitly;
# verify against the rest of the project, otherwise they can be dropped.
# The original name "LOGGGER_DELEGATION" (triple G) was a typo, corrected here.

# Handlers dictionary: a single file handler writing to LOG_FILE
LOG_HANDLERS = {
    'file': {
        'class': 'logging.FileHandler',
        'formatter': 'default',
        'filename': LOG_FILE,
    },
}

# Formatter definitions for the handlers above
LOG_FORMATTERS = {
    'default': {
        'format': LOG_FORMAT,
        'datefmt': '%Y-%m-%d %H:%M:%S',
    },
}

# Per-logger delegation: which loggers use which handlers
LOGGER_DELEGATION = {
    'scrapy': {
        'handlers': ['file'],
        'level': LOG_LEVEL,
        'propagate': False,  # do not propagate to the parent logger
    },
    'twisted.core.engine': {  # other core components can be configured the same way
        'handlers': ['file'],
        'level': LOG_LEVEL,
        'propagate': False,
    },
}

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# NOTE(review): the User-Agent below (Chrome 107) differs from USER_AGENT
# above (Edge 125). Because DefaultHeadersMiddleware applies these headers,
# requests will carry the Chrome 107 identity — confirm which UA is intended
# and remove the other to avoid the inconsistency.
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    "nmc.middlewares.NmcSpiderMiddleware": 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    # "nmc.middlewares.NmcDownloaderMiddleware": 543,
#    "nmc.middlewares.SeleniumDownloaderMiddleware": 543,
#    # fixed typo: the built-in class is UserAgentMiddleware (capital A);
#    # the original "UseragentMiddleware" would raise NameError if uncommented
#    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    "nmc.pipelines.NmcPipeline": 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"