How to Build a China Supplier Database with Made-in-China.com and Yiwugo Data
Finding reliable suppliers in China is one of the biggest challenges for importers and e-commerce sellers. Instead of manually browsing platforms and copying data into spreadsheets, you can build an automated supplier database that pulls data from multiple sources, deduplicates entries, and ranks suppliers by quality.
In this guide, I'll walk through building a local supplier database using data from Made-in-China.com and Yiwugo.com — two of the largest B2B wholesale platforms in China.
Why Build a Supplier Database?
If you're sourcing products from China, you probably:
- Search the same platforms repeatedly for different products
- Lose track of suppliers you've already evaluated
- Have no systematic way to compare suppliers across platforms
A local database solves all of this. You search once, store everything, and query it whenever you need.
Architecture Overview
Here's what we're building:
```
Made-in-China.com ──→ Scraper ──→ Normalizer ──→ SQLite DB ──→ Query/Rank
Yiwugo.com ─────────→ Scraper ──→ Normalizer ──↗
```
The pipeline has four stages:
1. Scrape product and supplier data from both platforms
2. Normalize fields into a common schema
3. Store in SQLite with deduplication
4. Query and rank suppliers by a composite score
Step 1: Define the Database Schema
We need a schema that works for both platforms. Here's the SQLite setup:
```python
import sqlite3
from datetime import datetime

def init_db(db_path="suppliers.db"):
    conn = sqlite3.connect(db_path)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS suppliers (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT NOT NULL,
            platform TEXT NOT NULL,
            location TEXT,
            product_count INTEGER DEFAULT 0,
            min_order_qty TEXT,
            avg_price_usd REAL,
            rating REAL,
            years_in_business INTEGER,
            verified INTEGER DEFAULT 0,
            url TEXT,
            last_updated TEXT,
            UNIQUE(name, platform)
        )
    """)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS products (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            supplier_id INTEGER,
            title TEXT,
            price TEXT,
            moq TEXT,
            category TEXT,
            image_url TEXT,
            product_url TEXT,
            scraped_at TEXT,
            FOREIGN KEY (supplier_id) REFERENCES suppliers(id)
        )
    """)
    conn.execute("CREATE INDEX IF NOT EXISTS idx_supplier_name ON suppliers(name)")
    conn.execute("CREATE INDEX IF NOT EXISTS idx_product_category ON products(category)")
    conn.commit()
    return conn
```
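A quick sanity check after initialization never hurts — this just lists the tables SQLite actually created:

```python
conn = init_db()
tables = conn.execute(
    "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'"
).fetchall()
print(tables)  # [('suppliers',), ('products',)]
```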
Key design decisions:
- `UNIQUE(name, platform)` prevents duplicates per platform
- Separate `suppliers` and `products` tables for normalization
- `verified` flag tracks platform verification status
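To see the deduplication guarantee in action, here's a minimal sketch: inserting the same supplier twice under the same platform leaves a single row. It uses SQLite's upsert syntax (requires SQLite 3.24+), the same pattern the insert functions in Step 3 rely on; the supplier name is made up for the demo.

```python
conn = init_db(":memory:")  # throwaway in-memory DB just for this demo
for _ in range(2):
    conn.execute("""
        INSERT INTO suppliers (name, platform, location, last_updated)
        VALUES ('Acme Lighting Co.', 'made-in-china', 'Shenzhen', datetime('now'))
        ON CONFLICT(name, platform) DO UPDATE SET last_updated = excluded.last_updated
    """)
print(conn.execute("SELECT COUNT(*) FROM suppliers").fetchone()[0])  # 1
```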
Step 2: Scrape Data with Apify
Instead of building scrapers from scratch, use pre-built Apify Actors:
```python
from apify_client import ApifyClient
import os

client = ApifyClient(os.environ.get("APIFY_TOKEN"))

def scrape_made_in_china(keyword, max_items=50):
    run = client.actor("jungle_intertwining/made-in-china-scraper").call(
        run_input={
            "keywords": [keyword],
            "maxItems": max_items
        }
    )
    return list(client.dataset(run["defaultDatasetId"]).iterate_items())

def scrape_yiwugo(keyword, max_items=50):
    run = client.actor("jungle_intertwining/yiwugo-scraper").call(
        run_input={
            "keywords": [keyword],
            "maxItems": max_items
        }
    )
    return list(client.dataset(run["defaultDatasetId"]).iterate_items())
```
Run both scrapers for the same keyword to get cross-platform data:
```python
keyword = "LED strip lights"
mic_data = scrape_made_in_china(keyword)
ywg_data = scrape_yiwugo(keyword)

print(f"Made-in-China: {len(mic_data)} products")
print(f"Yiwugo: {len(ywg_data)} products")
```
Step 3: Normalize and Insert Data
Each platform returns different field names. Normalize them before inserting:
```python
import re

def parse_price_usd(price_str):
    if not price_str:
        return None
    numbers = re.findall(r'[\d.]+', str(price_str))
    if numbers:
        return float(numbers[0])
    return None
```
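The parser grabs the first number it finds, so a price range collapses to its lower bound. A few illustrative calls (the sample strings are hypothetical):

```python
print(parse_price_usd("US$ 1.20-3.50"))  # 1.2 -- lower bound of the range
print(parse_price_usd("$5"))             # 5.0
print(parse_price_usd(None))             # None
```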
```python
def insert_mic_data(conn, items):
    for item in items:
        supplier_name = item.get("supplier", "Unknown")
        try:
            # Upsert (SQLite 3.24+). INSERT OR REPLACE would delete and
            # re-insert the row, changing its id and orphaning any products
            # already linked to it.
            conn.execute("""
                INSERT INTO suppliers (name, platform, location, url, last_updated)
                VALUES (?, 'made-in-china', ?, ?, ?)
                ON CONFLICT(name, platform) DO UPDATE SET
                    location = excluded.location,
                    url = excluded.url,
                    last_updated = excluded.last_updated
            """, (
                supplier_name,
                item.get("location", ""),
                item.get("supplierUrl", ""),
                datetime.now().isoformat()
            ))
            supplier_id = conn.execute(
                "SELECT id FROM suppliers WHERE name=? AND platform='made-in-china'",
                (supplier_name,)
            ).fetchone()[0]
            conn.execute("""
                INSERT INTO products
                (supplier_id, title, price, moq, category, image_url, product_url, scraped_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                supplier_id,
                item.get("title", ""),
                item.get("price", ""),
                item.get("moq", ""),
                item.get("category", ""),
                item.get("imageUrl", ""),
                item.get("url", ""),
                datetime.now().isoformat()
            ))
        except Exception as e:
            print(f"Error inserting {supplier_name}: {e}")
    conn.commit()
```
```python
def insert_ywg_data(conn, items):
    for item in items:
        supplier_name = item.get("shopName", "Unknown")
        try:
            conn.execute("""
                INSERT INTO suppliers (name, platform, location, url, last_updated)
                VALUES (?, 'yiwugo', ?, ?, ?)
                ON CONFLICT(name, platform) DO UPDATE SET
                    location = excluded.location,
                    url = excluded.url,
                    last_updated = excluded.last_updated
            """, (
                supplier_name,
                item.get("area", ""),
                item.get("shopUrl", ""),
                datetime.now().isoformat()
            ))
            supplier_id = conn.execute(
                "SELECT id FROM suppliers WHERE name=? AND platform='yiwugo'",
                (supplier_name,)
            ).fetchone()[0]
            conn.execute("""
                INSERT INTO products
                (supplier_id, title, price, moq, category, image_url, product_url, scraped_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                supplier_id,
                item.get("title", ""),
                item.get("price", ""),
                item.get("minOrder", ""),
                item.get("category", ""),
                item.get("imageUrl", ""),
                item.get("url", ""),
                datetime.now().isoformat()
            ))
        except Exception as e:
            print(f"Error inserting {supplier_name}: {e}")
    conn.commit()
```
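The two functions differ only in their field mapping, so if you add more platforms later it may be cleaner to factor the mapping out. A sketch (same assumed Actor field names as above, and it drops the per-item try/except for brevity):

```python
# Per-platform mapping from our schema to each Actor's (assumed) field names.
FIELD_MAPS = {
    "made-in-china": {"name": "supplier", "location": "location",
                      "url": "supplierUrl", "moq": "moq"},
    "yiwugo": {"name": "shopName", "location": "area",
               "url": "shopUrl", "moq": "minOrder"},
}

def insert_items(conn, platform, items):
    fm = FIELD_MAPS[platform]
    for item in items:
        supplier_name = item.get(fm["name"], "Unknown")
        conn.execute("""
            INSERT INTO suppliers (name, platform, location, url, last_updated)
            VALUES (?, ?, ?, ?, ?)
            ON CONFLICT(name, platform) DO UPDATE SET
                location = excluded.location,
                url = excluded.url,
                last_updated = excluded.last_updated
        """, (supplier_name, platform, item.get(fm["location"], ""),
              item.get(fm["url"], ""), datetime.now().isoformat()))
        supplier_id = conn.execute(
            "SELECT id FROM suppliers WHERE name=? AND platform=?",
            (supplier_name, platform)
        ).fetchone()[0]
        conn.execute("""
            INSERT INTO products
            (supplier_id, title, price, moq, category, image_url, product_url, scraped_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """, (supplier_id, item.get("title", ""), item.get("price", ""),
              item.get(fm["moq"], ""), item.get("category", ""),
              item.get("imageUrl", ""), item.get("url", ""),
              datetime.now().isoformat()))
    conn.commit()
```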
Step 4: Deduplicate Across Platforms
Some suppliers sell on both platforms. Find them by fuzzy name matching:
```python
from difflib import SequenceMatcher

def find_cross_platform_duplicates(conn, threshold=0.85):
    mic = conn.execute(
        "SELECT id, name FROM suppliers WHERE platform='made-in-china'"
    ).fetchall()
    ywg = conn.execute(
        "SELECT id, name FROM suppliers WHERE platform='yiwugo'"
    ).fetchall()
    duplicates = []
    for m_id, m_name in mic:
        for y_id, y_name in ywg:
            ratio = SequenceMatcher(None, m_name.lower(), y_name.lower()).ratio()
            if ratio >= threshold:
                duplicates.append({
                    "mic_id": m_id, "mic_name": m_name,
                    "ywg_id": y_id, "ywg_name": y_name,
                    "similarity": round(ratio, 3)
                })
    return duplicates
```
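Raw ratios can miss matches when one listing carries a legal suffix and the other doesn't ("Acme Lighting Co., Ltd." vs. "Acme Lighting"). Stripping common suffixes before comparing helps; here's a minimal sketch (the suffix list is illustrative, not exhaustive):

```python
import re

# Common legal/descriptive suffixes in Chinese supplier names (illustrative).
SUFFIXES = re.compile(
    r"\b(co\.?,?\s*ltd\.?|limited|company|factory|trading)\b",
    re.IGNORECASE,
)

def normalize_name(name):
    cleaned = SUFFIXES.sub("", name.lower())
    return re.sub(r"[^a-z0-9]+", " ", cleaned).strip()

# Swap normalize_name(m_name) / normalize_name(y_name) in for
# m_name.lower() / y_name.lower() inside the matching loop above.
print(normalize_name("Acme Lighting Co., Ltd."))  # "acme lighting"
```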
Cross-platform presence is actually a positive signal — it means the supplier is established enough to sell on multiple platforms.
Step 5: Score and Rank Suppliers
Build a composite score based on available data:
```python
def score_suppliers(conn):
    suppliers = conn.execute("""
        SELECT s.id, s.name, s.platform, s.location, s.verified,
               COUNT(p.id) as product_count,
               AVG(CAST(REPLACE(REPLACE(p.price, '$', ''), ',', '') AS REAL)) as avg_price
        FROM suppliers s
        LEFT JOIN products p ON p.supplier_id = s.id
        GROUP BY s.id
    """).fetchall()
    scored = []
    for s in suppliers:
        sid, name, platform, location, verified, prod_count, avg_price = s
        score = 0
        # More products = more established
        score += min(prod_count * 2, 30)
        # Verified suppliers get a bonus
        if verified:
            score += 20
        # Cross-platform presence bonus
        other = conn.execute(
            "SELECT COUNT(*) FROM suppliers WHERE name=? AND platform!=?",
            (name, platform)
        ).fetchone()[0]
        if other > 0:
            score += 15
        # Known manufacturing hubs get a small bonus
        hubs = ["yiwu", "shenzhen", "guangzhou", "dongguan", "ningbo"]
        if location and any(h in location.lower() for h in hubs):
            score += 5
        scored.append({"id": sid, "name": name, "platform": platform,
                       "score": score, "products": prod_count})
    return sorted(scored, key=lambda x: x["score"], reverse=True)
```
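One loose end worth closing: the SQL-side `AVG(CAST(...))` is fragile on strings like "US$ 1.20" (a non-numeric prefix casts to 0), and the `avg_price_usd` column from Step 1 is never populated. A sketch that backfills it using the Python-side `parse_price_usd` from Step 3 instead:

```python
def update_avg_prices(conn):
    # Backfill suppliers.avg_price_usd with the Python-side parser, which is
    # more forgiving than SQL CAST on mixed price strings.
    rows = conn.execute(
        "SELECT supplier_id, price FROM products WHERE supplier_id IS NOT NULL"
    ).fetchall()
    prices = {}
    for sid, price in rows:
        value = parse_price_usd(price)
        if value is not None:
            prices.setdefault(sid, []).append(value)
    for sid, values in prices.items():
        conn.execute("UPDATE suppliers SET avg_price_usd = ? WHERE id = ?",
                     (sum(values) / len(values), sid))
    conn.commit()
```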
Step 6: Query Your Database
Now you can run queries like:
```python
# Top 10 suppliers by score
top = score_suppliers(conn)[:10]
for s in top:
    print(f"{s['score']:>3} pts | {s['name'][:40]:<40} | {s['platform']}")

# Find suppliers in a specific city
shenzhen = conn.execute(
    "SELECT name, platform FROM suppliers WHERE location LIKE '%shenzhen%'"
).fetchall()

# Products under $5 with low MOQ
# (price is stored as TEXT, so sort on the numeric cast, not the raw string)
cheap = conn.execute("""
    SELECT p.title, p.price, p.moq, s.name
    FROM products p JOIN suppliers s ON p.supplier_id = s.id
    WHERE CAST(REPLACE(REPLACE(p.price, '$', ''), ',', '') AS REAL) < 5
    ORDER BY CAST(REPLACE(REPLACE(p.price, '$', ''), ',', '') AS REAL) ASC
    LIMIT 20
""").fetchall()
```
Putting It All Together
Here's the complete pipeline:
```python
def build_supplier_database(keywords, db_path="suppliers.db"):
    conn = init_db(db_path)

    for keyword in keywords:
        print(f"\n--- Scraping: {keyword} ---")

        mic_data = scrape_made_in_china(keyword)
        insert_mic_data(conn, mic_data)
        print(f" Made-in-China: {len(mic_data)} products")

        ywg_data = scrape_yiwugo(keyword)
        insert_ywg_data(conn, ywg_data)
        print(f" Yiwugo: {len(ywg_data)} products")

    # Find cross-platform suppliers
    dupes = find_cross_platform_duplicates(conn)
    print(f"\nCross-platform suppliers found: {len(dupes)}")

    # Score and rank
    ranked = score_suppliers(conn)
    print("\nTop 5 suppliers:")
    for s in ranked[:5]:
        print(f" {s['score']} pts - {s['name']} ({s['platform']})")

    conn.close()

# Run it
build_supplier_database([
    "LED strip lights",
    "phone cases",
    "yoga mats"
])
```
Scheduling Regular Updates
Keep your database fresh with a scheduled job. One option is the `schedule` library (a plain cron entry works too, shown below):
```python
# update_suppliers.py
# Assumes build_supplier_database from the pipeline above is importable here.
import schedule
import time

def daily_update():
    keywords = ["LED lights", "phone accessories", "fitness equipment"]
    build_supplier_database(keywords)

# Run the full pipeline once a day at 06:00
schedule.every().day.at("06:00").do(daily_update)

while True:
    schedule.run_pending()
    time.sleep(60)
```
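If you'd rather lean on cron itself, a crontab entry can run a one-shot version of the script (the interpreter and script paths are illustrative):

```
# crontab -e: run the update daily at 06:00
0 6 * * * /usr/bin/python3 /path/to/update_suppliers.py
```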
Or use Apify Schedules to run the scrapers automatically and pull results via webhook.
What You Can Build on Top
Once you have a supplier database, the possibilities open up:
- Price alerts: Track price changes over time and get notified when prices drop (see the sketch after this list)
- Supplier comparison reports: Generate PDF reports comparing suppliers for a specific product
- CRM integration: Push top suppliers into your procurement workflow
- Market trend analysis: Track which product categories are growing based on new listings
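As a sketch of the first idea: products are timestamped on every run, so a price-drop check is a self-join on the products table. This assumes you re-scrape the same listings so `product_url` is stable across runs, and it reuses the same crude price cast as Step 5:

```python
def find_price_drops(conn):
    # Pair older and newer snapshots of the same listing by product_url and
    # keep pairs where the newer price is lower. A sketch: it can return one
    # row per snapshot pair, so dedupe or keep only the extremes as needed.
    return conn.execute("""
        SELECT p2.title, p1.price AS old_price, p2.price AS new_price, p2.product_url
        FROM products p2
        JOIN products p1
          ON p1.product_url = p2.product_url
         AND p1.scraped_at < p2.scraped_at
        WHERE CAST(REPLACE(REPLACE(p2.price, '$', ''), ',', '') AS REAL)
            < CAST(REPLACE(REPLACE(p1.price, '$', ''), ',', '') AS REAL)
    """).fetchall()
```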
Tools Used
- Made-in-China.com Scraper — Extract B2B product and supplier data
- Yiwugo Scraper — Extract wholesale product data from Yiwu market
- DHgate Scraper — Extract wholesale prices and supplier info
- China Wholesale Scraper Toolkit — Cross-platform comparison scripts
Building a supplier database is one of the highest-ROI things you can do for your sourcing workflow. Instead of starting from scratch every time you need a new product, you query your own data first.

