How to Scrape Behance Portfolio Data with Python (2026)
Behance is Adobe's portfolio platform, home to millions of creative projects across graphic design, illustration, photography, UI/UX, motion graphics, and more. It's a rich dataset for studying design trends, tracking creative talent, building inspiration tools, or analyzing which visual styles win the most engagement.
The platform offers a limited API through Adobe's developer program, but most useful data requires direct web scraping. This guide covers both approaches — the official API for what it can do, and direct scraping for everything else — with production-ready Python code.
Why Scrape Behance?
Behance is unique among creative platforms in that it combines professional quality with public accessibility:
- Design trend research: Track which visual styles, color palettes, and tools are gaining traction across different industries
- Talent discovery: Identify emerging designers before they become expensive to hire, based on engagement velocity
- Tool adoption tracking: Behance projects explicitly list which software was used — the only large-scale dataset tracking creative tool market share
- Portfolio analysis: Understand what makes portfolios succeed — which types of projects earn the most appreciations, which categories are saturated
- Competitive intelligence: Track what designers are doing for specific brands or in specific industries
- Training data: High-quality labeled creative work for design-aware ML models
Setup
pip install httpx beautifulsoup4 lxml
# Note: sqlite3 ships with Python's standard library — no install needed
# Optional: for async scraping (asyncio is also part of the standard library)
pip install aiohttp
Understanding the Adobe/Behance API
Behance's API lives under Adobe's developer platform. You need an Adobe Developer Console account and a project with the Behance API enabled. The API is free but rate-limited at approximately 150 requests per hour per API key.
import httpx
import time
import json
import random
import sqlite3
import re
from typing import Optional, Dict, List, Any
from datetime import datetime
from pathlib import Path
# Adobe Developer Console client_id; sent as `client_id` on every API call.
API_KEY = "your_adobe_api_key"
# Root of the official Behance v2 REST API.
BASE_URL = "https://api.behance.net/v2"
# Browser-like default headers shared by API and web-scraping requests,
# to avoid trivial header-based bot filtering.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
    "Accept": "application/json, text/html, */*",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "DNT": "1",
}
class BehanceAPIClient:
    """Client for the official Behance/Adobe API.

    Wraps an httpx client with a self-enforced hourly budget (the API
    allows ~150 requests/hour per key) plus jittered pacing between
    requests, and normalizes errors to None returns.
    """

    def __init__(self, api_key: str, rate_limit_delay: float = 1.5):
        self.api_key = api_key
        self.delay = rate_limit_delay  # base pause between requests, seconds
        self.client = httpx.Client(headers=HEADERS, timeout=30)
        self._request_count = 0           # requests made in the current window
        self._window_start = time.time()  # start of the current 1-hour window

    def _rate_limit(self):
        """Enforce API rate limits (150/hour = ~2.5/min)."""
        # Reset the window BEFORE counting this request so the request is
        # attributed to the new window. (The original incremented first and
        # then zeroed the counter, losing the current request's count.)
        if time.time() - self._window_start > 3600:
            self._request_count = 0
            self._window_start = time.time()
        self._request_count += 1
        # Stay well under the 150/hour limit
        if self._request_count > 140:
            sleep_time = 3600 - (time.time() - self._window_start) + 60
            print(f"[RATE LIMIT] Approaching API limit, sleeping {sleep_time:.0f}s")
            time.sleep(sleep_time)
            self._request_count = 0
            self._window_start = time.time()
        # Jitter so request timing doesn't look mechanical.
        time.sleep(self.delay + random.uniform(0, 0.5))

    def get(self, endpoint: str, params: Optional[Dict] = None, _retry: bool = True) -> Optional[Dict]:
        """Make an authenticated API request with error handling.

        Args:
            endpoint: path relative to BASE_URL, e.g. "projects".
            params: extra query parameters merged with the client_id.
            _retry: internal guard — permits exactly one retry after a 429.
                (The original recursed unconditionally on 429, so repeated
                rate-limit responses could retry without bound.)

        Returns:
            Parsed JSON dict on success, None on any failure.
        """
        self._rate_limit()
        request_params = {"client_id": self.api_key}
        if params:
            request_params.update(params)
        try:
            response = self.client.get(
                f"{BASE_URL}/{endpoint}",
                params=request_params,
            )
            if response.status_code == 200:
                return response.json()
            elif response.status_code == 429:
                retry_after = int(response.headers.get("Retry-After", 60))
                print(f"[429] Rate limited, waiting {retry_after}s")
                time.sleep(retry_after)
                # Retry once, then give up.
                return self.get(endpoint, params, _retry=False) if _retry else None
            elif response.status_code == 403:
                print(f"[403] Forbidden — check API key validity")
                return None
            else:
                print(f"[ERROR] HTTP {response.status_code} for {endpoint}")
                return None
        except httpx.TimeoutException:
            print(f"[TIMEOUT] Request timed out for {endpoint}")
            return None
        except httpx.HTTPError as e:
            print(f"[ERROR] HTTP error: {e}")
            return None
api = BehanceAPIClient(API_KEY)
Searching Projects
The project search endpoint returns trending and curated creative work across all categories.
def search_projects(
    client: BehanceAPIClient,
    query: str,
    sort: str = "appreciations",
    page: int = 1,
    per_page: int = 48,
    field: Optional[str] = None,
    country: Optional[str] = None,
) -> List[Dict]:
    """Search Behance projects with filtering options.

    sort options: appreciations, views, comments, published_date
    field: graphic-design, illustration, photography, ui-ux, etc.

    Returns a list of flattened project dicts. Entries missing core
    keys are skipped (the original indexed p["id"]/p["name"]/p["url"]
    directly, so one malformed entry raised KeyError and lost the page).
    """
    params = {
        "q": query,
        "sort": sort,
        "page": page,
        # The API caps page size at 48.
        "per_page": min(per_page, 48),
    }
    if field:
        params["field"] = field
    if country:
        params["country"] = country
    data = client.get("projects", params)
    if not data:
        return []
    projects = []
    for p in data.get("projects", []):
        # Skip entries that lack the identifying fields we require.
        if any(k not in p for k in ("id", "name", "url")):
            continue
        stats = p.get("stats", {})
        projects.append({
            "id": p["id"],
            "name": p["name"],
            "slug": p.get("slug", ""),
            "url": p["url"],
            "appreciations": stats.get("appreciations", 0),
            "views": stats.get("views", 0),
            "comments": stats.get("comments", 0),
            "fields": p.get("fields", []),
            "owners": [
                {
                    "name": o.get("display_name", ""),
                    "username": o.get("username", ""),
                    "url": o.get("url", ""),
                }
                for o in p.get("owners", [])
            ],
            "published_on": p.get("published_on"),
            "covers": p.get("covers", {}),
            "mature_content": p.get("mature_content", 0),
        })
    return projects
def search_all_pages(
    client: BehanceAPIClient,
    query: str,
    max_pages: int = 10,
    sort: str = "appreciations",
) -> List[Dict]:
    """Collect search results across pages, stopping early on an empty page."""
    collected: List[Dict] = []
    for page_num in range(1, max_pages + 1):
        print(f" Fetching page {page_num}/{max_pages}...")
        batch = search_projects(client, query, sort=sort, page=page_num)
        if not batch:
            print(f" No more results at page {page_num}")
            break
        collected.extend(batch)
        print(f" Page {page_num}: {len(batch)} projects (total: {len(collected)})")
    return collected
# Example: Find top brand identity projects
projects = search_all_pages(api, "brand identity", max_pages=5, sort="appreciations")
print(f"\nTop 5 projects by appreciations:")
# Sort client-side by appreciation count and show the leaders with their creators.
for p in sorted(projects, key=lambda x: x["appreciations"], reverse=True)[:5]:
    print(f" {p['name'][:50]} — {p['appreciations']:,} appreciations")
    print(f" By: {', '.join(o['name'] for o in p['owners'])}")
Scraping Project Details
The API's project detail endpoint gives you the full content — image modules, text blocks, color palette, tools used, and complete statistics.
def get_project_detail(
    client: BehanceAPIClient,
    project_id: int,
) -> Optional[Dict]:
    """Get full project details including all content modules.

    Flattens the API's project payload: content modules (image / text /
    embed / media_collection), tags, tools, an RGB→hex color palette,
    stats, and owners.

    Returns None when the request failed or the payload has no project.
    """
    data = client.get(f"projects/{project_id}")
    if not data or "project" not in data:
        return None
    project = data["project"]

    modules = []
    for m in project.get("modules", []):
        mod = {
            "type": m["type"],
            "id": m.get("id"),
        }
        if m["type"] == "image":
            # Prefer the original-resolution rendition, then display size.
            mod["src"] = (
                m.get("sizes", {}).get("original")
                or m.get("sizes", {}).get("disp")
                or m.get("src", "")
            )
            mod["width"] = m.get("width")
            mod["height"] = m.get("height")
            mod["caption"] = m.get("caption", "")
        elif m["type"] == "text":
            # Use the plain-text variant when present; otherwise strip HTML tags.
            mod["text"] = m.get("text_plain") or re.sub(r"<[^>]+>", "", m.get("text", ""))
        elif m["type"] == "embed":
            mod["embed_url"] = m.get("original_url", "")
            mod["src"] = m.get("src", "")
        elif m["type"] == "media_collection":
            mod["components"] = [
                {"src": c.get("sizes", {}).get("original", ""), "width": c.get("width")}
                for c in m.get("components", [])
            ]
        modules.append(mod)

    # Process tags — they can be strings or dicts depending on API version.
    tags = []
    for t in project.get("tags", []):
        if isinstance(t, dict):
            tags.append(t.get("title") or t.get("name", ""))
        elif isinstance(t, str):
            tags.append(t)

    # Convert RGB component dicts to "#rrggbb" hex strings.
    colors = []
    for c in project.get("colors", []):
        if isinstance(c, dict):
            r, g, b = c.get("r", 0), c.get("g", 0), c.get("b", 0)
            colors.append(f"#{r:02x}{g:02x}{b:02x}")

    return {
        "id": project["id"],
        "name": project["name"],
        "slug": project.get("slug", ""),
        "url": project["url"],
        # `description` can be present but None — `or ""` keeps the slice
        # safe (the original sliced .get(...)[:1000] and crashed on None).
        "description": (project.get("description") or "")[:1000],
        "fields": project.get("fields", []),
        "tags": [t for t in tags if t],
        # Tools can also arrive as bare strings; guard them like tags.
        "tools": [
            t.get("title", "") if isinstance(t, dict) else t
            for t in project.get("tools", [])
        ],
        "colors": colors,
        "stats": project.get("stats", {}),
        "modules": modules,
        "module_count": len(modules),
        "image_count": sum(1 for m in modules if m["type"] == "image"),
        "owners": [
            {
                "name": o["display_name"],
                "username": o["username"],
                "location": o.get("location", ""),
                "url": o["url"],
            }
            for o in project.get("owners", [])
        ],
        "published_on": project.get("published_on"),
        # `license` may itself be None — same None-safety as description.
        "license": (project.get("license") or {}).get("license", ""),
        "scraped_at": datetime.utcnow().isoformat(),
    }
Designer Profile Scraping
Get comprehensive data about creative professionals including their stats, project portfolio, and work history.
def get_user_profile(
    client: BehanceAPIClient,
    username: str,
) -> Optional[Dict]:
    """Fetch and flatten a Behance user's profile; None if unavailable."""
    payload = client.get(f"users/{username}")
    if not payload or "user" not in payload:
        return None
    u = payload["user"]
    social = u.get("social_links", {})
    return {
        "id": u["id"],
        "username": u["username"],
        "display_name": u["display_name"],
        "location": u.get("location", ""),
        "occupation": u.get("occupation", ""),
        "company": u.get("company", ""),
        "url": u["url"],
        "website": u.get("website", ""),
        "twitter": social.get("twitter", ""),
        "linkedin": social.get("linkedin", ""),
        "stats": u.get("stats", {}),
        "fields": u.get("fields", []),
        "tools": [tool.get("title", "") for tool in u.get("tools", [])],
        "created_on": u.get("created_on"),
        "sections": list(u.get("sections", {}).keys()),
        "scraped_at": datetime.utcnow().isoformat(),
    }
def get_user_projects(
    client: BehanceAPIClient,
    username: str,
    max_pages: int = 5,
) -> List[Dict]:
    """Walk a user's project pages (24 per page) and return flat summaries."""
    results: List[Dict] = []
    page = 1
    while page <= max_pages:
        payload = client.get(f"users/{username}/projects", {"page": page, "per_page": 24})
        page_projects = payload.get("projects", []) if payload else []
        if not page_projects:
            # Request failed or we ran past the last page — stop either way.
            break
        for proj in page_projects:
            stats = proj.get("stats", {})
            results.append({
                "id": proj["id"],
                "name": proj["name"],
                "url": proj["url"],
                "appreciations": stats.get("appreciations", 0),
                "views": stats.get("views", 0),
                "comments": stats.get("comments", 0),
                "fields": proj.get("fields", []),
                "published_on": proj.get("published_on"),
            })
        page += 1
    return results
def get_user_appreciations(
    client: BehanceAPIClient,
    username: str,
) -> List[Dict]:
    """List the projects a user has appreciated (liked)."""
    payload = client.get(f"users/{username}/appreciations")
    if not payload:
        return []
    liked = []
    for entry in payload.get("appreciations", []):
        proj = entry.get("project", {})
        liked.append({
            "project_id": proj.get("id"),
            "project_name": proj.get("name"),
            "project_url": proj.get("url"),
            "appreciated_on": entry.get("timestamp"),
        })
    return liked
Web Scraping for Trending Data
The API doesn't expose Behance's trending and curated feeds well. For that, you need to scrape the web interface. Behance embeds JSON data in script tags that you can extract directly.
from bs4 import BeautifulSoup
import re
def scrape_trending_projects(
    page: int = 1,
    field: Optional[str] = None,
    proxy: Optional[str] = None,
) -> List[Dict]:
    """Scrape Behance's trending projects page via HTML.

    Args:
        page: 1-based result page (48 results per page, offset-paginated).
        proxy: optional proxy URL, mounted for both http and https.

    Returns flattened project dicts, falling back to HTML-card parsing
    when the embedded Next.js JSON is missing or malformed.
    """
    url = "https://www.behance.net/search/projects"
    params = {
        "tracking_source": "search_projects_recommended",
        "sort": "recommended",
        "time": "month",
        # Behance paginates by result offset rather than page number.
        "ordinal": (page - 1) * 48,
    }
    if field:
        params["field"] = field
    headers = {
        **HEADERS,
        "Referer": "https://www.behance.net/",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "same-origin",
    }
    # BUG FIX: the original opened a `with` client and then, when a proxy
    # was supplied, created a SECOND client that was never closed (leaking
    # the connection pool) while the first sat unused. Build the kwargs
    # once and open a single, properly-managed client.
    client_kwargs: Dict[str, Any] = {"headers": headers, "timeout": 30}
    if proxy:
        client_kwargs["proxies"] = {"http://": proxy, "https://": proxy}
    try:
        with httpx.Client(**client_kwargs) as client:
            response = client.get(url, params=params)
    except httpx.HTTPError as e:
        print(f"[ERROR] {e}")
        return []
    # Behance embeds multiple JSON blobs in script tags;
    # try to find the Next.js page props first.
    match = re.search(
        r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
        response.text,
        re.DOTALL,
    )
    if not match:
        # Fallback: try parsing HTML cards directly
        return scrape_project_cards_from_html(response.text)
    try:
        data = json.loads(match.group(1))
    except json.JSONDecodeError:
        return scrape_project_cards_from_html(response.text)
    # Navigate the Next.js data structure
    page_props = data.get("props", {}).get("pageProps", {})
    search_data = page_props.get("searchData", {}) or page_props.get("initialData", {})
    projects = []
    for item in search_data.get("projects", []):
        projects.append({
            "id": item.get("id"),
            "name": item.get("name"),
            "slug": item.get("slug", ""),
            "url": f"https://www.behance.net/gallery/{item.get('id')}/{item.get('slug', '')}",
            "views": item.get("stats", {}).get("views", 0),
            "appreciations": item.get("stats", {}).get("appreciations", 0),
            "fields": item.get("fields", []),
            "owners": [o.get("display_name", "") for o in item.get("owners", [])],
            "covers": item.get("covers", {}),
        })
    return projects
def scrape_project_cards_from_html(html: str) -> List[Dict]:
    """Fallback parser: pull project name/url/owner from rendered HTML cards."""
    soup = BeautifulSoup(html, "lxml")
    cards = []
    for cover in soup.select('[class*="ProjectCover"]'):
        link = cover.select_one("a[href*='/gallery/']")
        # A card without a gallery link carries no usable project reference.
        if link is None:
            continue
        href = link.get("href", "")
        title = cover.select_one('[class*="title"]')
        owner = cover.select_one('[class*="owner"]')
        cards.append({
            "name": title.text.strip() if title else "",
            "url": f"https://www.behance.net{href}" if href.startswith("/") else href,
            "owner": owner.text.strip() if owner else "",
        })
    return cards
Anti-Bot Measures and Proxy Integration
Behance/Adobe has layered protections you'll encounter at scale:
Cloudflare Protection: The web interface sits behind Cloudflare, which blocks automated requests based on TLS fingerprinting, header analysis, and behavioral patterns. Standard Python httpx requests often get challenged.
API Rate Limiting: The Behance API limits requests to ~150 per hour per key. Exceed that and you'll get 429 responses with a Retry-After header.
JavaScript Rendering: Behance heavily relies on client-side rendering. Much content loads via JavaScript after initial page load.
Geographic Restrictions: Some designer portfolios have regional visibility settings.
For collecting data at scale — indexing thousands of portfolios or building a design trend database — you need both proxy rotation and browser-like request patterns.
ThorData's residential proxy network handles Cloudflare effectively since residential IPs with proper TLS fingerprints pass Cloudflare's checks where datacenter IPs get blocked.
class ThorDataProxyPool:
    """Builds ThorData residential proxy URLs with optional geo/session targeting."""

    def __init__(self, username: str, password: str):
        self.username = username
        self.password = password
        self.host = "gate.thordata.com"
        self.port = 9000

    def get_proxy(self, country: Optional[str] = None, session_id: Optional[str] = None) -> str:
        # Targeting options are encoded as dash-joined suffixes on the
        # proxy username, in country-then-session order.
        segments = [self.username]
        if country:
            segments.append(f"country-{country.upper()}")
        if session_id:
            segments.append(f"session-{session_id}")
        user = "-".join(segments)
        return f"http://{user}:{self.password}@{self.host}:{self.port}"

    def get_rotating(self) -> str:
        """Fresh IP for each request."""
        return self.get_proxy()

    def get_sticky(self, session_id: str) -> str:
        """Same IP for duration of browsing session (2-10 min)."""
        return self.get_proxy(session_id=session_id)
def scrape_designer_with_proxy(
    client: BehanceAPIClient,
    username: str,
    proxy_pool: ThorDataProxyPool,
) -> Dict:
    """Scrape a designer's full profile, enriching API data with web-only fields.

    API calls go through `client` (which manages its own connection); only
    the web enrichment uses a proxy, with a sticky session so the profile
    pages are fetched from a single residential IP.

    Returns {} if the profile could not be fetched.
    (The original also pulled a rotating proxy here that was never used —
    removed as dead code.)
    """
    # API calls (via the client)
    profile = get_user_profile(client, username)
    if not profile:
        return {}
    projects = get_user_projects(client, username)
    # Web scraping for data the API doesn't expose (follower counts, etc.)
    session_proxy = proxy_pool.get_sticky(f"designer-{username[:10]}")
    web_data = scrape_designer_web_profile(username, proxy=session_proxy)
    profile["projects"] = projects
    profile["web_enrichment"] = web_data
    return profile
def scrape_designer_web_profile(username: str, proxy: Optional[str] = None) -> Dict:
    """Scrape additional profile data from Behance web (follower counts, etc).

    Args:
        username: Behance profile handle.
        proxy: optional proxy URL, mounted for both http and https.

    Returns a stats dict (possibly empty) on success, or
    {"error": "..."} when the request itself failed.
    """
    url = f"https://www.behance.net/{username}"
    headers = {
        **HEADERS,
        "Referer": "https://www.behance.net/",
    }
    # BUG FIX: the original built a `proxies` dict but never passed it to
    # httpx.get, so the proxy argument was silently ignored (and its keys
    # used "http"/"https" instead of httpx's "http://"/"https://" mounts).
    client_kwargs: Dict[str, Any] = {
        "headers": headers,
        "follow_redirects": True,
        "timeout": 30,
    }
    if proxy:
        client_kwargs["proxies"] = {"http://": proxy, "https://": proxy}
    try:
        with httpx.Client(**client_kwargs) as web_client:
            resp = web_client.get(url)
    except httpx.HTTPError as e:
        return {"error": str(e)}
    # Extract NEXT_DATA JSON — the most reliable source when present.
    match = re.search(r'<script id="__NEXT_DATA__"[^>]*>(.*?)</script>', resp.text, re.DOTALL)
    if match:
        try:
            data = json.loads(match.group(1))
            user_data = (
                data.get("props", {})
                .get("pageProps", {})
                .get("profile", {})
            )
            if user_data:
                user_stats = user_data.get("stats", {})
                return {
                    "followers": user_stats.get("followers", 0),
                    "following": user_stats.get("following", 0),
                    "project_views": user_stats.get("project_views", 0),
                    "appreciations": user_stats.get("appreciations", 0),
                }
        except json.JSONDecodeError:
            pass
    # HTML fallback: scrape the visible stat counters.
    soup = BeautifulSoup(resp.text, "lxml")
    stats = {}
    for stat_el in soup.select('[class*="UserStats"]'):
        label = stat_el.select_one('[class*="label"]')
        value = stat_el.select_one('[class*="count"]')
        if label and value:
            stats[label.text.strip().lower()] = value.text.strip()
    return stats
Data Storage
SQLite is the right storage layer for Behance data at portfolio scale.
def init_database(db_path: str = "behance.db") -> sqlite3.Connection:
    """Create (if absent) the Behance schema and return an open connection."""
    schema = """
    CREATE TABLE IF NOT EXISTS projects (
        id INTEGER PRIMARY KEY,
        name TEXT,
        slug TEXT,
        url TEXT UNIQUE,
        description TEXT,
        appreciations INTEGER DEFAULT 0,
        views INTEGER DEFAULT 0,
        comments INTEGER DEFAULT 0,
        fields TEXT, -- JSON array
        tags TEXT, -- JSON array
        tools TEXT, -- JSON array
        colors TEXT, -- JSON array of hex strings
        module_count INTEGER,
        image_count INTEGER,
        license TEXT,
        published_on INTEGER, -- Unix timestamp
        scraped_at TEXT
    );
    CREATE TABLE IF NOT EXISTS project_owners (
        project_id INTEGER,
        username TEXT,
        display_name TEXT,
        location TEXT,
        url TEXT,
        PRIMARY KEY (project_id, username),
        FOREIGN KEY (project_id) REFERENCES projects(id)
    );
    CREATE TABLE IF NOT EXISTS designers (
        username TEXT PRIMARY KEY,
        display_name TEXT,
        location TEXT,
        occupation TEXT,
        company TEXT,
        url TEXT,
        website TEXT,
        fields TEXT, -- JSON array
        tools TEXT, -- JSON array
        followers INTEGER,
        following INTEGER,
        appreciations INTEGER,
        project_views INTEGER,
        project_count INTEGER,
        scraped_at TEXT
    );
    CREATE TABLE IF NOT EXISTS trending_snapshots (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        project_id INTEGER,
        project_name TEXT,
        appreciations INTEGER,
        views INTEGER,
        field TEXT,
        snapshot_date TEXT,
        FOREIGN KEY (project_id) REFERENCES projects(id)
    );
    CREATE INDEX IF NOT EXISTS idx_projects_appreciations ON projects(appreciations DESC);
    CREATE INDEX IF NOT EXISTS idx_projects_field ON projects(fields);
    CREATE INDEX IF NOT EXISTS idx_trending_date ON trending_snapshots(snapshot_date);
    """
    conn = sqlite3.connect(db_path)
    # WAL lets a reader query while the scraper is writing.
    conn.execute("PRAGMA journal_mode=WAL")
    conn.executescript(schema)
    conn.commit()
    return conn
def save_project(conn: sqlite3.Connection, project: Dict):
    """Upsert one project row plus its owner rows, then commit."""
    # Detail payloads nest counts under "stats"; search summaries carry
    # them at the top level — accept either shape.
    stats = project.get("stats", {})
    row = (
        project["id"],
        project["name"],
        project.get("slug", ""),
        project["url"],
        project.get("description", ""),
        stats.get("appreciations", project.get("appreciations", 0)),
        stats.get("views", project.get("views", 0)),
        stats.get("comments", project.get("comments", 0)),
        json.dumps(project.get("fields", [])),
        json.dumps(project.get("tags", [])),
        json.dumps(project.get("tools", [])),
        json.dumps(project.get("colors", [])),
        project.get("module_count"),
        project.get("image_count"),
        project.get("license", ""),
        project.get("published_on"),
        project.get("scraped_at", datetime.utcnow().isoformat()),
    )
    conn.execute(
        """INSERT OR REPLACE INTO projects
        (id, name, slug, url, description, appreciations, views, comments,
        fields, tags, tools, colors, module_count, image_count, license, published_on, scraped_at)
        VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""",
        row,
    )
    for owner in project.get("owners", []):
        owner_row = (
            project["id"],
            owner.get("username"),
            owner.get("name"),
            owner.get("location"),
            owner.get("url"),
        )
        conn.execute(
            """INSERT OR REPLACE INTO project_owners (project_id, username, display_name, location, url)
            VALUES (?,?,?,?,?)""",
            owner_row,
        )
    conn.commit()
def query_top_tools(conn: sqlite3.Connection, limit: int = 20) -> List[Dict]:
    """Rank tools by how many scraped projects list them, most-used first."""
    counts: Dict[str, int] = {}
    # Tools are stored as a JSON array per project; skip empty lists in SQL.
    for (tools_json,) in conn.execute("SELECT tools FROM projects WHERE tools != '[]'"):
        for name in json.loads(tools_json):
            counts[name] = counts.get(name, 0) + 1
    ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
    return [{"tool": name, "count": n} for name, n in ranked[:limit]]
Complete Production Pipeline
def run_portfolio_scraper(
    search_queries: List[str],
    db_path: str = "behance.db",
    api_key: str = "",
    proxy_pool: Optional[ThorDataProxyPool] = None,
    max_pages_per_query: int = 5,
) -> Dict:
    """Full pipeline: search → project details → database.

    Args:
        search_queries: search terms to index.
        db_path: SQLite file path.
        api_key: Adobe/Behance API key for the API client.
        proxy_pool: optional proxy pool for the web-scraped trending feed.
        max_pages_per_query: search pages (48 results each) per query.

    Returns the run's counters ({projects_found, projects_saved, ...}).
    """
    conn = init_database(db_path)
    client = BehanceAPIClient(api_key)
    stats = {
        "projects_found": 0,
        "projects_saved": 0,
        "designers_scraped": 0,
        "errors": 0,
    }
    # try/finally so the connection is closed even if a query blows up
    # (the original leaked it on any exception).
    try:
        for query in search_queries:
            print(f"\n[QUERY] {query}")
            # Search for projects (API — no proxy needed; the original
            # pulled a rotating proxy here and never used it).
            projects = search_all_pages(client, query, max_pages=max_pages_per_query)
            stats["projects_found"] += len(projects)
            print(f" Found {len(projects)} projects")
            for project in projects:
                # Skip projects whose full details are already stored.
                existing = conn.execute(
                    "SELECT id FROM projects WHERE id = ? AND module_count IS NOT NULL",
                    (project["id"],)
                ).fetchone()
                if existing:
                    continue
                # Get full project details
                detail = get_project_detail(client, project["id"])
                if detail:
                    save_project(conn, detail)
                    stats["projects_saved"] += 1
                else:
                    # The original never incremented this counter.
                    stats["errors"] += 1
        # Snapshot today's trending feed for time-series analysis.
        proxy = proxy_pool.get_rotating() if proxy_pool else None
        trending = scrape_trending_projects(page=1, proxy=proxy)
        if trending:
            snapshot_date = datetime.utcnow().date().isoformat()
            for item in trending:
                conn.execute(
                    "INSERT INTO trending_snapshots (project_id, project_name, appreciations, views, snapshot_date) VALUES (?,?,?,?,?)",
                    (item.get("id"), item.get("name"), item.get("appreciations", 0), item.get("views", 0), snapshot_date)
                )
            conn.commit()
    finally:
        conn.close()
    print(f"\nPipeline complete: {stats}")
    return stats
# Example usage
if __name__ == "__main__":
    # Seed queries spanning several creative fields; tune for your niche.
    queries = [
        "brand identity 2026",
        "ui ux design system",
        "motion graphics branding",
        "illustration editorial",
    ]
    # With proxy rotation (recommended for large-scale collection):
    # pool = ThorDataProxyPool("YOUR_USER", "YOUR_PASS")
    # run_portfolio_scraper(queries, api_key="YOUR_API_KEY", proxy_pool=pool)
    run_portfolio_scraper(queries, api_key="YOUR_API_KEY")
Real-World Use Cases
Design Tool Market Share Tracker: Scrape project tools data monthly across all categories. Track Figma vs Sketch vs Adobe XD adoption, monitor Blender's rise in 3D/motion work, spot emerging tools before they go mainstream.
Hiring Pipeline Enricher: Given a list of candidate names, find their Behance profiles, pull project stats and skills. Automatically rank candidates by portfolio quality metrics (appreciations, views, tool diversity).
Trend Alert System: Run nightly trending scrapes across design fields. Compare today's trending projects to last week's. Alert when a new visual style or technique appears in multiple trending projects — often signals a coming mainstream trend.
Competitor Portfolio Analysis: Track all Behance-active designers at competitor agencies. Monitor their new project releases, engagement metrics, and client types. Get early warning when competitors shift strategy.
Creative Training Dataset: Build a labeled dataset of high-quality creative work by scraping top projects with their field labels, tool annotations, and color palettes. Useful for fine-tuning vision models on design-specific content.
Use ThorData residential proxies for large-scale portfolio collection — Cloudflare protection on Behance's web interface makes rotating residential IPs essential for anything beyond small-scale API usage.