# 京东商品批量采集系统:整店下载、SKU图提取与主图视频下载

## 引言

很多做电商的朋友在问:“能下载京东主图视频的软件推荐”“京东整店商品图片批量保存工具”“支持京东主图视频下载的软件有吗”。京东是国内三大电商平台之一,商品包含主图、SKU规格图(颜色/尺寸)、详情图、主图视频等,手动采集效率极低。店铺商品众多,整店采集需求量大。本文将完整实现一套京东商品批量采集系统,涵盖整店商品列表获取、主图提取、SKU图识别、主图视频下载、详情图提取、自动分类、断点续传等核心功能。一键存图正是基于这套技术实现的,下载的是原图、原尺寸、原格式,无任何压缩、无水印、无MD5篡改。

## 一、京东平台技术特点分析

### 1.1 核心难点

| 难点 | 说明 | 解决方案 |
| --- | --- | --- |
| 整店采集 | 需要遍历所有分页 | 分页解析、队列管理 |
| SKU图 | 颜色/尺寸规格图 | 智能识别SKU容器 |
| 主图视频 | mp4直链/m3u8格式 | 视频嗅探、m3u8下载 |
| 懒加载 | 滚动触发图片加载 | 自动滚动触发 |
| 反爬机制 | 检测非浏览器访问 | 浏览器方案、真实指纹 |

### 1.2 京东图片URL格式

```python
# 京东图片URL示例

# 原图格式
https://img13.360buyimg.com/n0/xxx.jpg
https://img14.360buyimg.com/popWaterMark/xxx.jpg

# 缩略图格式(需要转换)
https://img13.360buyimg.com/n1/xxx.jpg
https://img13.360buyimg.com/n2/xxx.jpg

# 原图规则:替换为n0或去除尺寸参数
```

## 二、京东整店商品列表获取

```python
# jd_shop_parser.py
import re
import time
from typing import List, Dict


class JDShopParser:
    """京东店铺商品列表解析器"""

    def __init__(self, browser_engine):
        self.browser = browser_engine

    def get_all_product_urls(self, shop_url: str, max_pages: int = 100) -> List[str]:
        """获取京东店铺所有商品链接"""
        all_urls = []

        for page in range(1, max_pages + 1):
            # 京东店铺分页URL格式
            page_url = f"{shop_url}/search-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-{page}.html"
            print(f"解析第{page}页: {page_url}")

            browser = self.browser.CreateBrowser(page_url)
            if not self._wait_for_load(browser, 10):
                break

            script = """
            (function() {
                const urls = [];
                const selectors = [
                    '.gl-item .p-img a',
                    '.J_ItemPic a',
                    '.product-item a'
                ];
                for (const selector of selectors) {
                    const elements = document.querySelectorAll(selector);
                    for (const el of elements) {
                        let href = el.href;
                        if (href && href.includes('item.jd.com')) {
                            urls.push(href);
                        }
                    }
                    if (urls.length > 0) break;
                }
                return urls;
            })();
            """

            new_urls = self._execute_script(browser, script)
            if not new_urls:
                break

            all_urls.extend(new_urls)
            print(f"第{page}页: {len(new_urls)}个商品")
            time.sleep(2)

        all_urls = list(set(all_urls))
        print(f"共发现{len(all_urls)}个商品")
        return all_urls

    def _wait_for_load(self, browser, timeout: int) -> bool:
        start = time.time()
        while time.time() - start < timeout:
            script = "document.readyState === 'complete'"
            if self._execute_script(browser, script):
                return True
            time.sleep(0.5)
        return False

    def _execute_script(self, browser, script: str):
        pass
```

## 三、京东商品解析引擎

```javascript
// jd_product_extractor.js
(function() {
    'use strict';

    /**
     * 京东商品解析器
     * 支持主图、SKU图、视频、详情图提取
     */
    class JDProductExtractor {
        constructor() {
            this.result = {
                title: '',
                mainImages: [],
                skuImages: [],      // SKU规格图(颜色/尺寸)
                detailImages: [],
                videos: []
            };
            this.seenUrls = new Set();
        }

        async waitForPageReady() {
            while (document.readyState !== 'complete') {
                await this.sleep(200);
            }
            await this.waitForImageContainer();
            await this.sleep(1000);
        }

        async waitForImageContainer() {
            let maxWait = 30;
            while (maxWait-- > 0) {
                if (document.querySelector('.spec-img, .J_zoomPic')) {
                    return;
                }
                await this.sleep(500);
            }
        }

        sleep(ms) {
            return new Promise(resolve => setTimeout(resolve, ms));
        }

        async triggerLazyLoad() {
            window.scrollTo(0, document.body.scrollHeight);
            await this.sleep(500);
            const step = document.body.scrollHeight / 5;
            for (let i = 1; i <= 5; i++) {
                window.scrollTo(0, i * step);
                await this.sleep(200);
            }
            window.scrollTo(0, 0);
            await this.sleep(300);
        }

        getOriginalUrl(url) {
            if (!url) return null;
            if (url.startsWith('data:')) return null;
            if (url.includes('1x1') || url.includes('blank')) return null;
            url = url.split('?')[0];
            // n1/n2 -> n0 (原图)
            url = url.replace(/\/n\d\//, '/n0/');
            url = url.replace(/_\d+x\d+\./g, '.');
            return url;
        }

        extractTitle() {
            const selectors = ['.sku-name', '.product-title', 'h1'];
            for (const s of selectors) {
                const el = document.querySelector(s);
                if (el && el.textContent) {
                    let title = el.textContent.trim();
                    if (title.length > 5) return title;
                }
            }
            return document.title || '京东商品';
        }

        extractMainImages() {
            const images = [];

            // 主图
            const mainImg = document.querySelector('.spec-img, .J_zoomPic');
            if (mainImg) {
                let url = mainImg.src || mainImg.getAttribute('data-lazy-img');
                if (url) {
                    const original = this.getOriginalUrl(url);
                    if (original && !this.seenUrls.has(original)) {
                        this.seenUrls.add(original);
                        images.push(original);
                    }
                }
            }

            // 缩略图列表
            const thumbs = document.querySelectorAll('.spec-thumb img, .J_thumImg');
            for (const thumb of thumbs) {
                let url = thumb.src || thumb.getAttribute('data-lazy-img');
                if (url) {
                    const original = this.getOriginalUrl(url);
                    if (original && !this.seenUrls.has(original)) {
                        this.seenUrls.add(original);
                        if (!images.includes(original)) {
                            images.push(original);
                        }
                    }
                }
            }
            return images;
        }

        extractSkuImages() {
            const skuImages = [];
            const skuContainer = document.querySelector('.sku-img-list, .J_skuImgList');
            if (skuContainer) {
                const items = skuContainer.querySelectorAll('.sku-img-item, .J_skuImgItem');
                for (const item of items) {
                    let name = '';
                    const nameEl = item.querySelector('.sku-name, .J_skuName');
                    if (nameEl) name = nameEl.textContent?.trim();
                    if (!name) name = item.getAttribute('title') || '规格';

                    const img = item.querySelector('img');
                    if (img) {
                        let url = img.src || img.getAttribute('data-lazy-img');
                        if (url) {
                            const original = this.getOriginalUrl(url);
                            if (original && !this.seenUrls.has(original)) {
                                this.seenUrls.add(original);
                                skuImages.push({ url: original, name: name });
                            }
                        }
                    }
                }
            }
            return skuImages;
        }

        extractDetailImages() {
            const images = [];
            const container = document.querySelector('#detail, .detail-content, .J_detailContent');
            if (container) {
                const imgs = container.querySelectorAll('img');
                for (const img of imgs) {
                    let url = img.src || img.getAttribute('data-lazy-img');
                    if (url) {
                        const original = this.getOriginalUrl(url);
                        if (original && !this.seenUrls.has(original)) {
                            this.seenUrls.add(original);
                            images.push(original);
                        }
                    }
                }
            }
            return images;
        }

        extractVideo() {
            // video标签
            const video = document.querySelector('.JDV-video video, .video-box video');
            if (video && video.src) {
                return { url: video.src, type: video.src.endsWith('.mp4') ? 'mp4' : 'm3u8' };
            }

            // 页面数据
            if (window.pageConfig && window.pageConfig.product && window.pageConfig.product.videoUrl) {
                return { url: window.pageConfig.product.videoUrl, type: 'mp4' };
            }
            return null;
        }

        async extract() {
            await this.waitForPageReady();
            await this.triggerLazyLoad();

            this.result.title = this.extractTitle();
            this.result.mainImages = this.extractMainImages();
            this.result.skuImages = this.extractSkuImages();
            this.result.detailImages = this.extractDetailImages();

            const video = this.extractVideo();
            if (video) this.result.videos.push(video);

            return this.result;
        }
    }

    return new JDProductExtractor().extract();
})();
```

## 四、m3u8视频下载器

```python
# m3u8_downloader.py
import os, time, requests, m3u8
from concurrent.futures import ThreadPoolExecutor


class M3U8Downloader:
    def __init__(self):
        self.headers = {'User-Agent': 'Mozilla/5.0', 'Referer': 'https://item.jd.com/'}

    def download(self, m3u8_url, output_path):
        playlist = m3u8.load(m3u8_url, headers=self.headers)
        base_url = '/'.join(m3u8_url.split('/')[:-1]) + '/'
        segments = [seg.uri if seg.uri.startswith('http') else base_url + seg.uri
                    for seg in playlist.segments]

        temp_dir = f"temp_{int(time.time())}"
        os.makedirs(temp_dir, exist_ok=True)

        ts_files = []
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = []
            for i, ts_url in enumerate(segments):
                ts_path = os.path.join(temp_dir, f"seg_{i:05d}.ts")
                futures.append(executor.submit(self._download_ts, ts_url, ts_path))
                ts_files.append(ts_path)
            for f in futures:
                f.result()

        with open(output_path, 'wb') as out:
            for ts in ts_files:
                if os.path.exists(ts):
                    with open(ts, 'rb') as f:
                        out.write(f.read())

        for ts in ts_files:
            os.remove(ts)
        os.rmdir(temp_dir)
        return True

    def _download_ts(self, url, path):
        for _ in range(3):
            try:
                resp = requests.get(url, headers=self.headers, timeout=30)
                if resp.status_code == 200:
                    with open(path, 'wb') as f:
                        f.write(resp.content)
                    return True
            except:
                time.sleep(1)
        return False
```

## 五、批量采集调度器

```python
# jd_batch_collector.py
import os, json, time, threading, re
from queue import Queue
from dataclasses import dataclass, asdict
from datetime import datetime


@dataclass
class JDProductData:
    url: str; sku_id: str; title: str
    main_images: list; sku_images: list; detail_images: list; videos: list
    success: bool = True; error: str = None; timestamp: str = None

    def __post_init__(self):
        if not self.timestamp:
            self.timestamp = datetime.now().isoformat()


class JDBatchCollector:
    def __init__(self, output_dir="./downloads/jd"):
        self.output_dir = output_dir; self.queue = Queue(); self.results = []
        self.lock = threading.Lock(); self.completed_ids = set()
        self.state_file = "jd_batch_state.json"; self._load_state()

    def _load_state(self):
        if os.path.exists(self.state_file):
            try:
                with open(self.state_file, 'r') as f:
                    self.completed_ids = set(json.load(f).get('completed_ids', []))
                print(f"加载断点: 已完成{len(self.completed_ids)}个商品")
            except:
                pass

    def _save_state(self):
        with self.lock:
            with open(self.state_file, 'w') as f:
                json.dump({'completed_ids': list(self.completed_ids),
                           'last_update': datetime.now().isoformat()}, f, indent=2)

    def add_urls(self, urls):
        for url in urls:
            sid = re.search(r'/(\d+)\.html', url)
            sid = sid.group(1) if sid else ''
            if sid and sid not in self.completed_ids:
                self.queue.put({'url': url, 'sku_id': sid})
        print(f"队列中有{self.queue.qsize()}个待处理商品")

    def collect_all(self, collector_func):
        print(f"开始批量采集")
        threads = [threading.Thread(target=self._worker, args=(collector_func,)) for _ in range(1)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        self._save_results()
        return self.results

    def _worker(self, collector_func):
        while not self.queue.empty():
            try:
                task = self.queue.get(timeout=1)
                print(f"采集商品: {task['sku_id']}")
                result = collector_func(task['url'])
                result.sku_id = task['sku_id']
                with self.lock:
                    self.results.append(result)
                    if result.success:
                        self.completed_ids.add(task['sku_id'])
                self._save_state()
                success_count = sum(1 for r in self.results if r.success)
                print(f"进度: {len(self.results)}个商品, 成功: {success_count}")
                time.sleep(2)
            except:
                pass

    def _save_results(self):
        file = f"jd_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(file, 'w', encoding='utf-8') as f:
            json.dump([asdict(r) for r in self.results], f, ensure_ascii=False, indent=2)
```

## 六、保存目录结构

```text
downloads/jd/
├── 100012345678_2024新款女装/
│   ├── 主图/
│   │   ├── 主图_1.jpg
│   │   ├── 主图_2.jpg
│   │   └── 主图_3.jpg
│   ├── SKU图/
│   │   ├── 红色.jpg
│   │   ├── 蓝色.jpg
│   │   └── 黑色.jpg
│   ├── 详情图/
│   │   └── 详情图_1.jpg
│   ├── 视频/
│   │   └── 视频.mp4
│   └── 商品信息.json
└── jd_results_20250101_120000.json
```

## 七、总结

| 模块 | 功能 |
| --- | --- |
| 整店解析 | 获取店铺所有商品链接 |
| SKU识别 | 颜色/尺寸规格图自动分类 |
| 视频下载 | mp4/m3u8格式支持 |
| 批量调度 | 队列、断点续传 |

核心要点:

- 基于Chromium浏览器内核
- 下载的是京东的原图、原尺寸、原格式

结论:如果你需要一款稳定、自动分类、支持全平台的电商图片下载工具,一键存图是目前最省心的选择。百度搜索“一键存图”或“火蚁一键存图”即可找到。