# Python爬取商品数据实现白酒销售系统开发指南
## 一、项目概述与系统架构设计
白酒销售系统是一个结合数据爬取、数据分析、库存管理和销售预测的综合性项目。该系统主要通过Python技术栈实现商品数据的自动化采集、处理和可视化展示,为白酒销售业务提供数据支撑和决策支持。
### 系统核心架构组件
| 模块名称 | 技术实现 | 功能描述 |
|---------|---------|---------|
| 数据采集层 | Requests/Scrapy/Selenium | 负责从电商平台爬取白酒商品数据 |
| 数据处理层 | Pandas/Numpy | 数据清洗、格式转换、特征工程 |
| 数据存储层 | MySQL/MongoDB | 结构化与非结构化数据存储 |
| 业务逻辑层 | Django/Flask | 销售管理、库存控制、用户管理 |
| 可视化展示 | Echarts/Plotly | 销售数据分析图表展示 |
## 二、数据爬取模块实现
### 2.1 电商平台数据爬取基础框架
```python
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import json
import re
class BaijiuSpider:
    """Scrape baijiu product listings from JD search result pages.

    Each product is returned as a dict with the keys ``platform``,
    ``product_name``, ``price``, ``shop`` and ``crawl_time``.
    """

    SEARCH_URL = "https://search.jd.com/Search"
    # Seconds to wait between two page requests.
    REQUEST_DELAY = 2

    def __init__(self):
        self.session = requests.Session()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        }

    def crawl_jd_baijiu(self, keyword="白酒", pages=5):
        """Crawl JD search results for ``keyword``.

        Args:
            keyword: Search term sent to JD.
            pages: Number of result pages to request, starting at page 1.

        Returns:
            A ``pandas.DataFrame`` with one row per parsed product. It is
            empty (no columns) when nothing could be crawled.
        """
        products = []
        for page in range(1, pages + 1):
            try:
                # Passing the query via ``params`` lets requests URL-encode
                # the keyword (spaces, '&', '#', ...) correctly.
                response = self.session.get(
                    self.SEARCH_URL,
                    params={'keyword': keyword, 'page': page},
                    headers=self.headers,
                    timeout=10,
                )
                # Treat HTTP error pages as failures instead of parsing them.
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')
                for item in soup.find_all('li', class_='gl-item'):
                    product = self.parse_jd_product(item)
                    if product:
                        products.append(product)
            except Exception as e:
                # Best effort: a failed page must not abort the whole crawl.
                print(f"爬取第{page}页失败: {e}")
            # Throttle after failures too, but not after the last page.
            if page < pages:
                time.sleep(self.REQUEST_DELAY)
        return pd.DataFrame(products)

    @staticmethod
    def _text_of(item, css_class):
        """Return the stripped text of the first ``div.css_class`` in ``item``, or ""."""
        elem = item.find('div', class_=css_class)
        return elem.get_text(strip=True) if elem else ""

    def parse_jd_product(self, item):
        """Extract one product record from a JD result ``<li>`` element.

        Returns:
            The product dict, or ``None`` when the element has no product
            name or parsing raised an exception.
        """
        try:
            name = self._text_of(item, 'p-name')
            if not name:
                # A record without a product name is of no use downstream.
                return None
            return {
                'platform': '京东',
                'product_name': name,
                'price': self.clean_price(self._text_of(item, 'p-price')),
                'shop': self._text_of(item, 'p-shop'),
                'crawl_time': pd.Timestamp.now()
            }
        except Exception as e:
            print(f"解析商品失败: {e}")
            return None

    def clean_price(self, price_text):
        """Convert a raw price string such as ``"¥1,499.00"`` to a float.

        Thousands separators (ASCII and full-width commas) are removed before
        the first number in the text is extracted.

        Returns:
            The parsed price, or ``0.0`` when the text is empty or has no number.
        """
        if not price_text:
            return 0.0
        normalized = price_text.replace(',', '').replace(',', '')
        match = re.search(r'\d+(?:\.\d+)?', normalized)
        return float(match.group(0)) if match else 0.0
# Usage example: crawl three pages of results for one keyword and preview them.
if __name__ == "__main__":
    spider = BaijiuSpider()
    df = spider.crawl_jd_baijiu(keyword="茅台", pages=3)
    print(f"爬取到 {len(df)} 条白酒商品数据")
    preview = df.head()
    print(preview)
```
### 2.2 多平台数据采集扩展
```python
class MultiPlatformSpider:
    """Run the available platform crawlers for a list of keywords.

    ``self.spiders`` maps a platform name to either a ``BaijiuSpider``
    instance (for ``'jd'``) or a callable that takes a keyword and returns a
    ``pandas.DataFrame``.
    """

    # Immutable default, so the keyword list is never shared between calls.
    DEFAULT_KEYWORDS = ("茅台", "五粮液", "泸州老窖")

    def __init__(self):
        self.spiders = {
            'jd': BaijiuSpider(),
            'taobao': self.crawl_taobao,
            'tmall': self.crawl_tmall
        }

    def crawl_taobao(self, keyword):
        """Placeholder for the Taobao crawler; always raises NotImplementedError."""
        raise NotImplementedError("淘宝爬虫尚未实现")

    def crawl_tmall(self, keyword):
        """Placeholder for the Tmall crawler; always raises NotImplementedError."""
        raise NotImplementedError("天猫爬虫尚未实现")

    def crawl_all_platforms(self, keywords=DEFAULT_KEYWORDS):
        """Crawl every platform for every keyword, one after another.

        A failing platform/keyword combination is reported with ``print`` and
        skipped, so the remaining combinations still run.

        Args:
            keywords: Iterable of search terms.

        Returns:
            One DataFrame with all collected rows plus a ``keyword`` column,
            or an empty DataFrame when nothing was collected.
        """
        all_data = []
        for keyword in keywords:
            for platform, spider in self.spiders.items():
                try:
                    if platform == 'jd':
                        data = spider.crawl_jd_baijiu(keyword, pages=2)
                    else:
                        data = spider(keyword)
                    data['keyword'] = keyword
                    all_data.append(data)
                except Exception as e:
                    print(f"{platform}平台爬取{keyword}失败: {e}")
        if not all_data:
            # pd.concat raises ValueError on an empty list.
            return pd.DataFrame()
        return pd.concat(all_data, ignore_index=True)
```
## 三、数据处理与存储模块
### 3.1 数据清洗与标准化
```python
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
class DataProcessor:
    """Clean crawled product data and persist it through SQLAlchemy."""

    # Checked in order; the first brand contained in the product name wins.
    KNOWN_BRANDS = ('茅台', '五粮液', '泸州老窖', '洋河', '汾酒', '古井贡酒', '郎酒')
    # Volume in millilitres: "500ml", "500ML", "500mL", "500 ml", "500毫升".
    ML_PATTERN = r'(\d+)\s*(?:ml|毫升)'

    def __init__(self, db_url="sqlite:///baijiu.db"):
        self.engine = create_engine(db_url)

    def clean_product_data(self, df, max_price=10000):
        """Drop unusable rows and derive the ``brand`` and ``specification`` columns.

        Args:
            df: DataFrame with at least ``product_name`` and ``price`` columns.
            max_price: Exclusive upper bound for a plausible price.

        Returns:
            A new DataFrame; the input is never modified. An empty input is
            returned as an empty copy without the derived columns.
        """
        if df.empty:
            # A crawl that found nothing may yield a frame without columns.
            return df.copy()
        df = df.dropna(subset=['product_name', 'price'])
        # .copy() so the column assignments below act on an independent frame
        # instead of a slice of the caller's data.
        df = df[(df['price'] > 0) & (df['price'] < max_price)].copy()
        df['brand'] = df['product_name'].apply(self.identify_brand)
        df['specification'] = df['product_name'].apply(self.extract_specification)
        return df

    def identify_brand(self, product_name):
        """Return the first known brand contained in ``product_name``, else '其他'."""
        for brand in self.KNOWN_BRANDS:
            if brand in product_name:
                return brand
        return '其他'

    def extract_specification(self, product_name):
        """Return the bottle volume as ``"<n>ml"``, or '未知' when none is found."""
        match = re.search(self.ML_PATTERN, product_name, re.IGNORECASE)
        if match:
            return f"{match.group(1)}ml"
        return "未知"

    def save_to_database(self, df, table_name='product_data'):
        """Append ``df`` to ``table_name``; an empty frame is skipped."""
        if df.empty:
            print(f"没有可保存的数据,已跳过写入{table_name}表")
            return
        df.to_sql(table_name, self.engine, if_exists='append', index=False)
        print(f"数据已保存到{table_name}表,共{len(df)}条记录")
# Data-processing example: clean a crawled DataFrame and append it to the database.
# NOTE(review): `df` is not defined in this snippet — presumably the frame produced
# by the crawler usage example; confirm it is in scope before running.
processor = DataProcessor()
cleaned_df = processor.clean_product_data(df)
processor.save_to_database(cleaned_df)
```
### 3.2 数据分析与特征工程
```python
class DataAnalyzer:
    """Compute price statistics and distribution reports for product data.

    The analyzer works on a private copy of the DataFrame, so the helper
    column added by ``generate_sales_report`` never appears in the caller's
    frame.
    """

    def __init__(self, df):
        self.df = df.copy()

    def price_analysis(self):
        """Summarise prices overall and per brand.

        Returns:
            A tuple ``(analysis, brand_stats)``: ``analysis`` is a dict with
            ``total_products``, ``avg_price``, ``max_price``, ``min_price``
            and ``price_std``; ``brand_stats`` is a DataFrame indexed by
            brand with ``mean``, ``count`` and ``std`` rounded to 2 decimals.
        """
        prices = self.df['price']
        analysis = {
            'total_products': len(self.df),
            'avg_price': prices.mean(),
            'max_price': prices.max(),
            'min_price': prices.min(),
            'price_std': prices.std()
        }
        brand_stats = self.df.groupby('brand')['price'].agg(['mean', 'count', 'std']).round(2)
        return analysis, brand_stats

    def generate_sales_report(self):
        """Build distribution counts by price range, platform and brand.

        Adds a ``price_range`` column to ``self.df`` (the analyzer's own copy).

        Returns:
            A dict of ``pandas.Series``: ``price_distribution``,
            ``platform_distribution`` and ``top_brands`` (at most 10 brands).
        """
        # pd.cut bins are right-inclusive: (0, 100], (100, 500], ...
        price_bins = [0, 100, 500, 1000, 5000, float('inf')]
        price_labels = ['0-100', '100-500', '500-1000', '1000-5000', '5000+']
        self.df['price_range'] = pd.cut(self.df['price'], bins=price_bins, labels=price_labels)
        report = {
            'price_distribution': self.df['price_range'].value_counts(),
            'platform_distribution': self.df['platform'].value_counts(),
            'top_brands': self.df['brand'].value_counts().head(10)
        }
        return report
# Analysis example: compute the price statistics and the distribution report
# for the cleaned data.
# NOTE(review): `cleaned_df` is not defined in this snippet — presumably the result
# of the data-processing example; confirm it is in scope before running.
analyzer = DataAnalyzer(cleaned_df)
price_analysis, brand_stats = analyzer.price_analysis()
sales_report = analyzer.generate_sales_report()
```
## 四、Web系统开发实现
### 4.1 Flask Web应用框架
```python
from flask import Flask, render_template, request, jsonify
import pandas as pd
import json
# Flask application object; the @app.route decorators in this snippet register views on it.
app = Flask(__name__)
class BaijiuSalesSystem:
    """Hold the product data served by the web API in ``self.df``."""

    def __init__(self):
        self.df = pd.DataFrame()

    def load_data(self, engine=None):
        """Load the ``product_data`` table into ``self.df``.

        Args:
            engine: Database connectable passed to ``pandas.read_sql``. When
                omitted, the module-level ``processor.engine`` is used.

        Returns:
            ``True`` on success; ``False`` when loading failed, in which case
            ``self.df`` keeps its previous content and the reason is printed.
        """
        try:
            if engine is None:
                engine = processor.engine
            self.df = pd.read_sql('product_data', engine)
            return True
        except Exception as e:
            # Best effort by design, but never swallow the reason silently
            # (and never trap KeyboardInterrupt/SystemExit as a bare except would).
            print(f"加载商品数据失败: {e}")
            return False
# Module-level instance read by the API route handlers. Data is loaded once when
# this module is executed; the boolean result of load_data() is ignored here.
system = BaijiuSalesSystem()
system.load_data()
@app.route('/')
def index():
    """Render the landing page template."""
    landing_page = render_template('index.html')
    return landing_page
@app.route('/api/products')
def get_products():
    """Return one page of product records as JSON.

    Query parameters:
        page: 1-based page number (default 1; values below 1 are treated as 1).
        per_page: Page size (default 20; clamped to the range 1..100).

    Response fields: ``products``, ``total``, ``page``, ``per_page`` and
    ``total_pages``.
    """
    max_per_page = 100
    # Clamp client input: per_page=0 would divide by zero below and negative
    # values would turn into meaningless negative slices.
    page = max(request.args.get('page', 1, type=int), 1)
    per_page = min(max(request.args.get('per_page', 20, type=int), 1), max_per_page)

    start = (page - 1) * per_page
    end = start + per_page
    products = system.df.iloc[start:end].to_dict('records')
    total = len(system.df)
    return jsonify({
        'products': products,
        'total': total,
        'page': page,
        'per_page': per_page,
        # Ceiling division without floats.
        'total_pages': (total + per_page - 1) // per_page
    })
def _json_safe(value):
    """Recursively convert ``value`` into something that serialises to valid JSON.

    Dicts are processed recursively, numpy scalars are unwrapped to Python
    scalars, and NaN becomes ``None`` (JSON ``null``), because a bare ``NaN``
    token is rejected by ``JSON.parse`` in the browser.
    """
    if isinstance(value, dict):
        return {key: _json_safe(item) for key, item in value.items()}
    if hasattr(value, 'item'):
        value = value.item()
    if isinstance(value, float) and value != value:  # NaN is the only float != itself
        return None
    return value


@app.route('/api/analysis')
def get_analysis():
    """Return price statistics and distribution counts as JSON.

    Responds with HTTP 503 and an ``error`` field when no product data is loaded.
    """
    if system.df.empty:
        return jsonify({'error': '暂无商品数据'}), 503

    # Analyse a copy: generate_sales_report adds a helper column to its frame,
    # which must not leak into the shared data served by other endpoints.
    analyzer = DataAnalyzer(system.df.copy())
    price_analysis, brand_stats = analyzer.price_analysis()
    sales_report = analyzer.generate_sales_report()
    payload = {
        'price_analysis': price_analysis,
        # std is NaN for a brand with a single product.
        'brand_stats': brand_stats.to_dict(),
        'sales_report': {
            'price_distribution': sales_report['price_distribution'].to_dict(),
            'platform_distribution': sales_report['platform_distribution'].to_dict(),
            'top_brands': sales_report['top_brands'].to_dict()
        }
    }
    return jsonify(_json_safe(payload))
@app.route('/dashboard')
def dashboard():
    """Render the data-visualisation dashboard template."""
    dashboard_page = render_template('dashboard.html')
    return dashboard_page
if __name__ == '__main__':
    import os

    # Debug mode enables the interactive Werkzeug debugger, which lets anyone
    # who can reach the server execute arbitrary code. Keep it opt-in:
    # set FLASK_DEBUG=1 (or "true") for local development only.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true')
    app.run(debug=debug_enabled, port=5000)
```
### 4.2 前端可视化界面
```html
<!DOCTYPE html>
<html>
<head>
<title>白酒销售数据分析系统</title>
<script src="https://cdn.jsdelivr.net/npm/echarts@5.4.3/dist/echarts.min.js"></script>
<style>
.dashboard-container {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 20px;
padding: 20px;
}
.chart-container {
background: white;
border-radius: 8px;
padding: 20px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
</style>
</head>
<body>
<div class="dashboard-container">
<div class="chart-container">
<div id="priceChart" style="height: 400px;"></div>
</div>
<div class="chart-container">
<div id="brandChart" style="height: 400px;"></div>
</div>
</div>
<script>
// Create both ECharts instances, load the analysis data from the API and render it.
function initCharts() {
  // Price-distribution chart
  const priceChart = echarts.init(document.getElementById('priceChart'));
  // Brand-distribution chart
  const brandChart = echarts.init(document.getElementById('brandChart'));

  fetch('/api/analysis')
    .then(response => {
      // fetch() only rejects on network errors, so HTTP error statuses
      // must be turned into failures explicitly.
      if (!response.ok) {
        throw new Error(`HTTP ${response.status}`);
      }
      return response.json();
    })
    .then(data => {
      renderPriceChart(priceChart, data.sales_report.price_distribution);
      renderBrandChart(brandChart, data.sales_report.top_brands);
    })
    .catch(error => {
      console.error('加载分析数据失败:', error);
    });

  // ECharts does not follow container size changes on its own.
  window.addEventListener('resize', () => {
    priceChart.resize();
    brandChart.resize();
  });
}
// Render the price distribution as a pie chart.
// `priceData` maps a price-range label to the number of products in that range.
function renderPriceChart(chart, priceData) {
  const slices = Object.keys(priceData).map(name => ({ name, value: priceData[name] }));
  const pieSeries = {
    name: '价格分布',
    type: 'pie',
    radius: '50%',
    data: slices
  };
  chart.setOption({
    title: { text: '白酒价格分布' },
    tooltip: { trigger: 'item' },
    series: [pieSeries]
  });
}
// Render the brand distribution as a bar chart.
// `brandData` maps a brand name to its product count.
function renderBrandChart(chart, brandData) {
  const brands = [];
  const counts = [];
  for (const [brand, count] of Object.entries(brandData)) {
    brands.push(brand);
    counts.push(count);
  }
  chart.setOption({
    title: { text: '品牌分布' },
    tooltip: { trigger: 'axis' },
    xAxis: { type: 'category', data: brands },
    yAxis: { type: 'value' },
    series: [{ data: counts, type: 'bar' }]
  });
}
// Initialise the charts once the DOM has been parsed, so that the
// #priceChart and #brandChart container elements exist.
document.addEventListener('DOMContentLoaded', initCharts);
</script>
</body>
</html>
```
## 五、系统部署与优化
### 5.1 定时爬虫任务调度
```python
import schedule
import time
from datetime import datetime
def scheduled_crawling():
    """Run one crawl → clean → store cycle; intended to be called by the scheduler.

    Errors are reported and contained: an exception escaping a scheduled job
    would propagate out of ``schedule.run_pending()`` and stop the scheduler
    loop, cancelling all future runs.
    """
    print(f"{datetime.now()} - 开始执行定时爬虫任务")
    try:
        spider = MultiPlatformSpider()
        new_data = spider.crawl_all_platforms()
        if new_data.empty:
            # Nothing was crawled, so there is nothing to clean or store.
            print(f"{datetime.now()} - 本次未爬取到数据,跳过入库")
            return
        processor = DataProcessor()
        cleaned_data = processor.clean_product_data(new_data)
        processor.save_to_database(cleaned_data)
    except Exception as e:
        print(f"{datetime.now()} - 爬虫任务失败: {e}")
        return
    print(f"{datetime.now()} - 爬虫任务完成,新增{len(cleaned_data)}条数据")
# Register the recurring jobs: the crawl runs daily at 02:00 and a heartbeat
# message is printed every hour.
def _heartbeat():
    print("系统运行中...")


schedule.every().day.at("02:00").do(scheduled_crawling)
schedule.every().hour.do(_heartbeat)

if __name__ == "__main__":
    # Poll once a minute for jobs that are due.
    while True:
        schedule.run_pending()
        time.sleep(60)
```
### 5.2 性能优化建议
1. **爬虫优化**:使用Scrapy框架替代Requests,实现异步爬取提高效率
2. **数据库优化**:建立合适的索引,对品牌、价格等常用查询字段建立索引
3. **缓存策略**:对分析结果使用Redis进行缓存,减少数据库查询压力
4. **分布式部署**:使用Docker容器化部署,实现系统的弹性扩展
## 六、项目创新点与扩展方向
### 6.1 技术创新点
1. **多源数据融合**:整合多个电商平台数据,提供全面的市场洞察
2. **实时价格监控**:通过定时爬虫实现价格波动监控和预警
3. **智能品牌识别**:基于品牌关键词匹配自动识别和分类白酒品牌,后续可引入自然语言处理技术提升识别准确率
4. **可视化决策支持**:通过交互式图表为销售决策提供数据支持
### 6.2 业务扩展方向
1. **销售预测模型**:基于历史数据建立销量预测算法
2. **竞品分析模块**:深度分析竞争对手定价和促销策略
3. **库存优化系统**:基于销售预测实现智能库存管理
4. **移动端应用**:开发微信小程序或APP,实现移动办公
该系统完整实现了从数据采集到可视化展示的全流程,为白酒销售企业提供了强大的数据支撑平台。通过Python技术栈的灵活运用,不仅实现了功能需求,还具备了良好的扩展性和维护性。