Scraping Mixcloud DJ Sets and Track Metadata with Python (2026)
Mixcloud is a music streaming platform focused on DJ mixes, radio shows, and long-form audio. Unlike Spotify or SoundCloud, it specializes in continuous mixes — which makes it a goldmine for anyone tracking DJ activity, music trends, or building a mix recommendation engine.
The good news: Mixcloud has a GraphQL API that powers their frontend. It's not officially documented for public use, but it's accessible and returns structured data. This guide covers how to use it to extract DJ profiles, set metadata, tracklists, and listener statistics.
Why Mixcloud Data is Valuable
Mixcloud's niche is long-form audio: radio shows, DJ mixes, podcast-style content. Data you can extract is useful for:
- Genre trend tracking — which electronic music sub-genres are gaining traction
- DJ analytics — listener counts, favorite counts, follower growth over time
- Tracklist databases — building a searchable index of which tracks appear in which mixes
- Recommendation engines — "fans of DJ X also listen to DJ Y" based on follower overlap
- Music label research — tracking which tracks labels are pushing to DJs before mainstream release
- Event booking intelligence — identifying rising DJs before they get expensive
Discovering the GraphQL API
Open any Mixcloud page and watch the Network tab. You'll see requests going to https://app.mixcloud.com/graphql. Every page load fires GraphQL queries for the data it needs — user profiles, cloudcasts (their term for uploads), tracklists, comments.
The API uses standard GraphQL — POST requests with a JSON body containing query and variables fields.
Setup
pip install httpx
No browser automation needed here. The GraphQL API responds to plain HTTP requests. httpx is enough.
Fetching a DJ's Profile
import json
import time

import httpx
import pandas as pd
# Mixcloud's internal GraphQL endpoint — the same one the web frontend calls.
GRAPHQL_URL = "https://app.mixcloud.com/graphql"
# Browser-like request headers. The API checks Referer and Origin, and
# rejects empty or bot-like User-Agent strings (see "Rate Limits" below).
HEADERS = {
"Content-Type": "application/json",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 Chrome/126.0.0.0 Safari/537.36",
"Referer": "https://www.mixcloud.com/",
"Accept": "application/json",
"Origin": "https://www.mixcloud.com",
}
def fetch_user_profile(username: str) -> dict:
    """Fetch a Mixcloud user's public profile via the GraphQL API.

    Args:
        username: Mixcloud username (the path segment of the profile URL).

    Returns:
        The ``user`` object from the GraphQL response (dict); None if the
        user does not exist.

    Raises:
        httpx.HTTPStatusError: on non-2xx responses.
        RuntimeError: if the API returns GraphQL-level errors (these come
            back with HTTP 200, so they must be checked explicitly).
    """
    # NOTE: the original used escaped quotes (\"\"\") here, which is a
    # Python syntax error — fixed to a normal triple-quoted string.
    query = """
    query UserProfile($lookup: UserLookup!) {
      user(lookup: $lookup) {
        username
        displayName
        biog
        city
        country
        followerCount
        followingCount
        cloudcastCount
        listeningCount
        isUploader
        isVerified
        picture {
          url
        }
        website
        createdTime
      }
    }
    """
    variables = {"lookup": {"username": username}}
    response = httpx.post(
        GRAPHQL_URL,
        json={"query": query, "variables": variables},
        headers=HEADERS,
        timeout=15,
    )
    response.raise_for_status()
    payload = response.json()
    # GraphQL reports errors in-band with a 200 status; surface them.
    if payload.get("errors"):
        raise RuntimeError(f"GraphQL errors: {payload['errors']}")
    return payload["data"]["user"]


profile = fetch_user_profile("NTSRadio")
print(json.dumps(profile, indent=2))
This returns follower counts, upload counts, location, and bio — structured data, no HTML parsing needed.
Listing DJ Sets (Cloudcasts)
def parse_upload_node(node: dict) -> dict:
    """Flatten one GraphQL upload node into a plain summary dict.

    Guards against null/missing optional fields: tags, categories, and
    engagement counters may be absent, and audioLength may be null for
    live shows (the original crashed on null with a TypeError).
    """
    duration = node["audioLength"] or 0  # may be null -> treat as 0
    return {
        "name": node["name"],
        "slug": node["slug"],
        "date": node["publishDate"],
        "duration_seconds": node["audioLength"],
        "duration_minutes": round(duration / 60, 1),
        "listeners": node.get("listenerCount", 0),
        "favorites": node.get("favoriteCount", 0),
        "reposts": node.get("repostCount", 0),
        "is_exclusive": node.get("isExclusive", False),
        "tags": [t["tag"]["name"] for t in (node.get("tags") or [])],
        "categories": [c["name"] for c in (node.get("categories") or [])],
        "widget_url": node.get("widgetUrl"),
    }


def fetch_cloudcasts(username: str, first: int = 20, after: str = None) -> dict:
    """Fetch one page of a user's uploads (cloudcasts) with metadata.

    Args:
        username: Mixcloud username.
        first: Page size.
        after: Opaque GraphQL cursor from a previous page (None = page 1).

    Returns:
        Dict with "sets" (list of parsed upload summaries), "has_next",
        "cursor" (pass as ``after`` for the next page), and "total_count".
    """
    # Fixed: the original delimited this string with \"\"\" (syntax error).
    query = """
    query UserCloudcasts(
      $lookup: UserLookup!,
      $first: Int!,
      $after: String
    ) {
      user(lookup: $lookup) {
        uploads(first: $first, after: $after) {
          edges {
            node {
              slug
              name
              publishDate
              audioLength
              listenerCount
              favoriteCount
              repostCount
              isExclusive
              widgetUrl
              picture {
                url
              }
              tags {
                tag {
                  name
                  slug
                }
              }
              categories {
                slug
                name
              }
            }
          }
          pageInfo {
            hasNextPage
            endCursor
          }
          totalCount
        }
      }
    }
    """
    variables = {
        "lookup": {"username": username},
        "first": first,
        "after": after,
    }
    response = httpx.post(
        GRAPHQL_URL,
        json={"query": query, "variables": variables},
        headers=HEADERS,
        timeout=15,
    )
    response.raise_for_status()
    data = response.json()["data"]["user"]["uploads"]
    return {
        "sets": [parse_upload_node(edge["node"]) for edge in data["edges"]],
        "has_next": data["pageInfo"]["hasNextPage"],
        "cursor": data["pageInfo"]["endCursor"],
        "total_count": data.get("totalCount"),
    }
# Demo: list the five most recent NTS Radio uploads.
result = fetch_cloudcasts("NTSRadio", first=5)
for s in result["sets"]:
    line = f"{s['name']} — {s['duration_minutes']}min, {s['listeners']} listeners, tags: {s['tags']}"
    print(line)
Extracting Tracklists
The tracklist for a mix is the most valuable data — it tells you which songs were played. Mixcloud requires uploaders to tag tracks (it's how they handle licensing), so most mixes have complete tracklists.
def parse_tracklist(cloudcast: dict) -> dict:
    """Convert a GraphQL cloudcast object into a tracklist summary dict.

    Handles sections where "artist", "song", or "startSeconds" are
    explicitly null (untagged portions of a mix). The original used
    ``section.get("artist", {})``, which only covers a *missing* key and
    raises AttributeError on an explicit JSON null.
    """
    tracks = []
    for section in (cloudcast.get("sections") or []):
        artist_obj = section.get("artist") or {}
        song_obj = section.get("song") or {}
        start = section.get("startSeconds") or 0  # null-safe
        tracks.append({
            "artist": artist_obj.get("name", "Unknown"),
            "artist_slug": artist_obj.get("slug", ""),
            "song": song_obj.get("name", "Unknown"),
            "song_slug": song_obj.get("slug", ""),
            "start_seconds": start,
            "start_time": f"{start // 60}:{start % 60:02d}",
        })
    return {
        "mix_name": cloudcast["name"],
        "publish_date": cloudcast.get("publishDate"),
        "listener_count": cloudcast.get("listenerCount"),
        "tracks": tracks,
        "track_count": len(tracks),
    }


def fetch_tracklist(username: str, cloudcast_slug: str) -> dict:
    """Fetch a mix's tracklist (artist, song, start time per section).

    Args:
        username: Uploader's Mixcloud username.
        cloudcast_slug: The mix's URL slug.

    Returns:
        Dict with "mix_name", "publish_date", "listener_count", "tracks"
        (list of track dicts), and "track_count". (The original return
        annotation said ``list``; the function actually returns a dict.)
    """
    # Fixed: the original delimited this string with \"\"\" (syntax error).
    query = """
    query CloudcastTracklist(
      $lookup: CloudcastLookup!
    ) {
      cloudcast(lookup: $lookup) {
        name
        publishDate
        listenerCount
        sections {
          artist {
            name
            slug
          }
          song {
            name
            slug
          }
          startSeconds
        }
      }
    }
    """
    variables = {
        "lookup": {
            "username": username,
            "slug": cloudcast_slug,
        }
    }
    response = httpx.post(
        GRAPHQL_URL,
        json={"query": query, "variables": variables},
        headers=HEADERS,
        timeout=15,
    )
    response.raise_for_status()
    cloudcast = response.json()["data"]["cloudcast"]
    return parse_tracklist(cloudcast)
# Demo: pull one tracklist and print each entry with its start offset.
tracklist = fetch_tracklist("NTSRadio", "some-show-slug")
print(f"Mix: {tracklist['mix_name']} ({tracklist['track_count']} tracks)")
for t in tracklist["tracks"]:
    entry = f" {t['start_time']} — {t['artist']} - {t['song']}"
    print(entry)
Pagination: Getting All Sets
Most DJs have hundreds of uploads. You need cursor-based pagination to get them all:
def fetch_all_cloudcasts(username: str, max_sets: int = None) -> list:
    """Paginate through a user's entire upload catalog.

    Follows GraphQL cursors 50 sets at a time, sleeping 2s between pages
    to stay under the rate limit. Stops early once ``max_sets`` sets have
    been collected (the result is trimmed to exactly that many).
    """
    collected = []
    cursor = None
    page_number = 1
    while True:
        print(f"Fetching page {page_number}...")
        page = fetch_cloudcasts(username, first=50, after=cursor)
        collected.extend(page["sets"])
        total = page.get("total_count")
        if total:
            print(f" Got {len(collected)}/{total}")
        if not page["has_next"]:
            break
        if max_sets and len(collected) >= max_sets:
            break
        cursor = page["cursor"]
        page_number += 1
        time.sleep(2)
    if max_sets:
        return collected[:max_sets]
    return collected


all_sets = fetch_all_cloudcasts("Boaboradio", max_sets=200)
print(f"Total: {len(all_sets)} sets")
Fetching Follower Lists
For social graph analysis — tracking which DJs share an audience — you can fetch follower data:
def fetch_followers(username: str, first: int = 50, after: str = None) -> dict:
    """Fetch one page of a user's followers for social-graph analysis.

    Args:
        username: Mixcloud username whose followers to list.
        first: Page size.
        after: Opaque GraphQL cursor from a previous page (None = page 1).

    Returns:
        Dict with "followers" (list of user dicts: username, displayName,
        followerCount, cloudcastCount, city, country), "has_next", and
        "cursor" for the next page.
    """
    # Fixed: the original delimited this string with \"\"\" (syntax error).
    query = """
    query UserFollowers($lookup: UserLookup!, $first: Int!, $after: String) {
      user(lookup: $lookup) {
        followers(first: $first, after: $after) {
          edges {
            node {
              username
              displayName
              followerCount
              cloudcastCount
              city
              country
            }
          }
          pageInfo {
            hasNextPage
            endCursor
          }
        }
      }
    }
    """
    variables = {
        "lookup": {"username": username},
        "first": first,
        "after": after,
    }
    response = httpx.post(
        GRAPHQL_URL,
        json={"query": query, "variables": variables},
        headers=HEADERS,
        timeout=15,
    )
    response.raise_for_status()
    data = response.json()["data"]["user"]["followers"]
    return {
        "followers": [edge["node"] for edge in data["edges"]],
        "has_next": data["pageInfo"]["hasNextPage"],
        "cursor": data["pageInfo"]["endCursor"],
    }
Rate Limits and Anti-Bot Protections
Mixcloud's GraphQL API is relatively permissive compared to most platforms, but it does have protections:
- Rate limiting: You'll start getting 429 responses if you exceed roughly 60 requests per minute. Keep it under that.
- IP blocking: Sustained heavy usage from a single IP will eventually get blocked. For large-scale collection (hundreds of DJs, full catalogs), rotate your IPs. ThorData's residential proxies are a good option here — rotating residential IPs prevent the consistent-IP pattern that triggers blocks.
- User-Agent validation: Send a realistic browser User-Agent. Empty or bot-like UAs get rejected.
- Referer header: Include `Referer: https://www.mixcloud.com/` — the API checks for it.
- Origin header: Include `Origin: https://www.mixcloud.com` as well.
def create_session_with_proxy(proxy_url: str = None) -> httpx.Client:
    """Build an httpx.Client preconfigured with browser-like headers.

    Pass a proxy URL ("http://user:pass@host:port") to route traffic
    through a rotating proxy; omit it for a direct connection.
    """
    return httpx.Client(
        headers=HEADERS,
        proxy=proxy_url or None,
        timeout=15,
        follow_redirects=True,
    )
def safe_graphql_request(client: httpx.Client, query: str, variables: dict, retries: int = 3) -> dict:
    """POST a GraphQL query with retry logic for rate limits and timeouts.

    - HTTP 429: back off linearly (30s, 60s, 90s) before retrying.
    - Timeout: wait 5s and retry.

    Raises:
        RuntimeError: when all retries are exhausted. (Was a bare
            ``Exception``; RuntimeError is more specific and still caught
            by any existing ``except Exception`` handler.)
        httpx.HTTPStatusError: on other non-2xx responses.
    """
    # Fixed: the original docstring used \"\"\" delimiters (syntax error).
    for attempt in range(retries):
        try:
            response = client.post(
                GRAPHQL_URL,
                json={"query": query, "variables": variables},
            )
            if response.status_code == 429:
                wait = 30 * (attempt + 1)  # linear backoff
                print(f"Rate limited. Waiting {wait}s...")
                time.sleep(wait)
                continue
            response.raise_for_status()
            return response.json()
        except httpx.TimeoutException:
            print(f"Timeout on attempt {attempt + 1}")
            time.sleep(5)
    raise RuntimeError("Max retries exceeded")


# Without proxy (fine for small jobs)
client = create_session_with_proxy()
# With rotating proxy (for bulk scraping)
client = create_session_with_proxy("http://user:[email protected]:9000")
Building a Genre Trend Tracker
Here's a practical application — tracking which genres are trending on Mixcloud by analyzing tag frequency across recent uploads:
from collections import Counter
def genre_trends(usernames: list, sets_per_user: int = 20) -> dict:
    """Aggregate tag frequencies across recent uploads of several DJs.

    Fetches up to ``sets_per_user`` recent sets per DJ, lowercases every
    tag, and returns the 30 most common tags plus summary counts.
    Per-DJ failures are logged and skipped.
    """
    tag_counts = Counter()
    analyzed = 0
    for username in usernames:
        try:
            page = fetch_cloudcasts(username, first=sets_per_user)
            for mix in page["sets"]:
                tag_counts.update(tag.lower() for tag in mix["tags"])
                analyzed += 1
        except Exception as e:
            print(f"Error for {username}: {e}")
        time.sleep(2)
    return {
        "top_genres": dict(tag_counts.most_common(30)),
        "total_sets_analyzed": analyzed,
        "unique_tags": len(tag_counts),
    }


# Track trends across popular DJ accounts
djs = ["Boaboradio", "NTSRadio", "RinseFM", "fabriclondon", "FABRICLIVE", "BerghainKantine"]
trends = genre_trends(djs)
print(f"Analyzed {trends['total_sets_analyzed']} sets, found {trends['unique_tags']} unique tags")
for genre, count in list(trends["top_genres"].items())[:15]:
    print(f" {genre}: {count}")
Storing Mix Data in SQLite
For building a searchable tracklist database:
import sqlite3
def init_mixcloud_db(db_path: str = "mixcloud.db") -> sqlite3.Connection:
    """Create (if needed) and open the mix-database schema.

    Tables:
        djs        — one row per scraped DJ profile.
        cloudcasts — one row per mix, keyed by (username, slug).
        tracks     — one row per track appearance, FK to cloudcasts.

    Returns an open sqlite3.Connection. Idempotent: safe to call on an
    existing database.
    """
    # Fixed: the original delimited the SQL strings with \"\"\" (syntax error).
    conn = sqlite3.connect(db_path)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS djs (
            username TEXT PRIMARY KEY,
            display_name TEXT,
            city TEXT,
            country TEXT,
            follower_count INTEGER,
            cloudcast_count INTEGER,
            scraped_at TEXT DEFAULT CURRENT_TIMESTAMP
        )
    """)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS cloudcasts (
            slug TEXT,
            username TEXT,
            name TEXT,
            publish_date TEXT,
            duration_seconds INTEGER,
            listener_count INTEGER,
            favorite_count INTEGER,
            repost_count INTEGER,
            tags TEXT,
            scraped_at TEXT DEFAULT CURRENT_TIMESTAMP,
            PRIMARY KEY (username, slug)
        )
    """)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS tracks (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            username TEXT,
            cloudcast_slug TEXT,
            artist TEXT,
            artist_slug TEXT,
            song TEXT,
            song_slug TEXT,
            start_seconds INTEGER,
            FOREIGN KEY (username, cloudcast_slug) REFERENCES cloudcasts(username, slug)
        )
    """)
    # Indexes support the artist/song lookups used elsewhere in the file.
    conn.execute("CREATE INDEX IF NOT EXISTS idx_tracks_artist ON tracks(artist)")
    conn.execute("CREATE INDEX IF NOT EXISTS idx_tracks_song ON tracks(song)")
    conn.commit()
    return conn
def save_mix_to_db(conn: sqlite3.Connection, username: str, tracklist_data: dict):
    """Save a full mix with its tracklist to the database.

    Idempotent: re-saving the same mix replaces the cloudcast row AND its
    track rows (the original only replaced the cloudcast row, so each
    re-save appended duplicate tracks).

    Note: fetch_tracklist() does not include "slug" in its result — set
    tracklist_data["slug"] before calling, or it defaults to "".
    """
    # Fixed: the original docstring used \"\"\" delimiters (syntax error).
    slug = tracklist_data.get("slug", "")
    conn.execute(
        "INSERT OR REPLACE INTO cloudcasts (slug, username, name, publish_date, listener_count) VALUES (?,?,?,?,?)",
        (slug, username, tracklist_data["mix_name"],
         tracklist_data.get("publish_date"), tracklist_data.get("listener_count"))
    )
    # Clear previously-saved tracks so repeated saves don't duplicate rows.
    conn.execute(
        "DELETE FROM tracks WHERE username = ? AND cloudcast_slug = ?",
        (username, slug)
    )
    for track in tracklist_data["tracks"]:
        conn.execute(
            "INSERT INTO tracks (username, cloudcast_slug, artist, artist_slug, song, song_slug, start_seconds) VALUES (?,?,?,?,?,?,?)",
            (username, slug, track["artist"], track.get("artist_slug", ""),
             track["song"], track.get("song_slug", ""), track["start_seconds"])
        )
    conn.commit()
def find_artist_appearances(conn: sqlite3.Connection, artist_name: str) -> list[dict]:
    """Find all mixes featuring a specific artist (substring match).

    Returns one dict per track appearance with the mix it appeared in,
    newest first. "start_time" is formatted as "M:SS".

    Bug fix: the tracks table stores start_seconds — the original
    selected a nonexistent t.start_time column, which raised
    sqlite3.OperationalError on every call.
    """
    # Also fixed: the original delimited the SQL with \"\"\" (syntax error).
    rows = conn.execute("""
        SELECT t.artist, t.song, t.start_seconds, c.name, c.username, c.publish_date
        FROM tracks t
        JOIN cloudcasts c ON t.username = c.username AND t.cloudcast_slug = c.slug
        WHERE t.artist LIKE ?
        ORDER BY c.publish_date DESC
    """, (f"%{artist_name}%",)).fetchall()
    return [
        {"artist": r[0], "song": r[1],
         "start_time": f"{(r[2] or 0) // 60}:{(r[2] or 0) % 60:02d}",
         "mix_name": r[3], "dj": r[4], "date": r[5]}
        for r in rows
    ]
GraphQL Schema Discovery
You can introspect the schema to discover available fields:
def introspect_type(type_name: str):
    """Introspect a GraphQL type to discover its available fields.

    Returns the raw GraphQL response dict. If the server has
    introspection disabled, "data" may be null or missing — callers
    should check before indexing into it.
    """
    # Fixed: the original delimited this string with \"\"\" (syntax error).
    query = """
    query IntrospectType($name: String!) {
      __type(name: $name) {
        name
        fields {
          name
          type {
            name
            kind
            ofType {
              name
              kind
            }
          }
          description
        }
      }
    }
    """
    response = httpx.post(
        GRAPHQL_URL,
        json={"query": query, "variables": {"name": type_name}},
        headers=HEADERS,
        timeout=15,
    )
    return response.json()


# Discover what fields are available on User type.
# `data` and `ofType` can be explicit JSON nulls — `or {}` guards .get().
schema = introspect_type("User")
if (schema.get("data") or {}).get("__type"):
    for field in schema["data"]["__type"]["fields"]:
        field_type = field["type"].get("name") or (field["type"].get("ofType") or {}).get("name")
        print(f" {field['name']}: {field_type}")
Introspection might be disabled — some deployments lock it down. But it's worth trying before you reverse-engineer queries from the frontend JavaScript.
Use Cases for Mixcloud Data
DJ career analytics: Track how a DJ's listener counts grow over time. A DJ with 2x listener growth in 3 months is someone to watch (or book before their rates go up).
Track popularity signals: Songs that appear in many mixes from respected DJs are likely to chart commercially. This is a leading indicator that labels and A&R use.
Label intelligence: Filter tracklists by record label to see how effectively a label is getting their releases into DJ sets. Higher placement rate = stronger label relationships.
Event programming: Cross-reference which DJs are getting the most play at which venues by scraping venue radio shows and resident DJ pages.
Playlist seeding: Extract tracklists from genre-defining mixes to seed a Spotify or Apple Music playlist with accurately categorized tracks.
Conclusion
Mixcloud's GraphQL API is one of the friendlier scraping targets. Structured data, cursor-based pagination, and relatively light anti-bot measures make it straightforward to build datasets of DJ activity, track plays, and genre trends. Keep your request rates reasonable, use proper headers, and you'll get far with just httpx and no browser automation. For bulk collection at scale, ThorData's residential proxy rotation handles the IP throttling without manual management.
Discovering Rising DJs
Track follower growth velocity to find DJs who are gaining traction:
import sqlite3
from datetime import datetime, date
def track_dj_growth(usernames: list[str], db_path: str = "mixcloud.db"):
    """Record follower counts for DJs over time to track growth.

    Writes one snapshot row per (DJ, day); running twice on the same day
    overwrites that day's row via INSERT OR REPLACE. Per-DJ failures are
    logged and skipped.
    """
    conn = init_mixcloud_db(db_path)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS growth_snapshots (
            username TEXT,
            snapshot_date TEXT,
            followers INTEGER,
            cloudcast_count INTEGER,
            PRIMARY KEY (username, snapshot_date)
        )
    """)
    conn.commit()
    snapshot_day = date.today().isoformat()
    for dj in usernames:
        try:
            profile = fetch_user_profile(dj)
            row = (dj, snapshot_day, profile.get("followerCount"), profile.get("cloudcastCount"))
            conn.execute("INSERT OR REPLACE INTO growth_snapshots VALUES (?,?,?,?)", row)
            print(f"{dj}: {profile.get('followerCount'):,} followers")
        except Exception as e:
            print(f"Error for {dj}: {e}")
        time.sleep(2)
    conn.commit()
    conn.close()
def find_fastest_growing(db_path: str = "mixcloud.db", min_snapshots: int = 7) -> list[dict]:
    """Find DJs with the fastest relative follower growth.

    Requires at least ``min_snapshots`` snapshots per DJ (collected by
    track_dj_growth). Growth is (max - min) / min followers over the
    observed period, as a percentage.
    """
    conn = sqlite3.connect(db_path)
    rows = conn.execute("""
        SELECT username,
               MIN(followers) as min_followers,
               MAX(followers) as max_followers,
               COUNT(*) as snapshot_count,
               MIN(snapshot_date) as first_date,
               MAX(snapshot_date) as last_date
        FROM growth_snapshots
        GROUP BY username
        HAVING snapshot_count >= ?
        ORDER BY (MAX(followers) - MIN(followers)) * 1.0 / NULLIF(MIN(followers), 0) DESC
        LIMIT 20
    """, (min_snapshots,)).fetchall()
    conn.close()
    return [
        {
            "username": r[0],
            "min_followers": r[1],
            "max_followers": r[2],
            # Bug fix: growth must use follower counts r[1]/r[2]; the
            # original mixed in r[3] (the snapshot COUNT), producing
            # nonsense percentages.
            "growth_pct": round((r[2] - r[1]) / max(r[1], 1) * 100, 1) if r[1] else 0,
            "period": f"{r[4]} to {r[5]}",
        }
        for r in rows
    ]
Building a Track Popularity Index
Count how many times each track appears across all scraped mixes to build a popularity index:
def build_track_index(db_path: str = "mixcloud.db") -> pd.DataFrame:
    """Build a popularity index of tracks across all scraped mixes.

    Counts how often each (artist, song) pair appears, how many distinct
    DJs played it, and when it was first/last seen. Excludes tracks
    tagged 'Unknown' and tracks seen only once.

    Bug fix: the file never imported pandas, so the ``pd`` reference
    raised NameError — pandas is now imported at the top of the file.
    """
    conn = sqlite3.connect(db_path)
    rows = conn.execute("""
        SELECT t.artist, t.song,
               COUNT(*) as appearance_count,
               COUNT(DISTINCT t.username) as unique_djs,
               MIN(c.publish_date) as first_seen,
               MAX(c.publish_date) as last_seen
        FROM tracks t
        JOIN cloudcasts c ON t.username = c.username AND t.cloudcast_slug = c.slug
        WHERE t.artist != 'Unknown' AND t.song != 'Unknown'
        GROUP BY t.artist, t.song
        HAVING appearance_count >= 2
        ORDER BY appearance_count DESC
    """).fetchall()
    conn.close()
    # Column order mirrors the SELECT list above.
    return pd.DataFrame(rows, columns=["artist", "song", "appearances", "unique_djs", "first_seen", "last_seen"])
# Demo: show the 20 most frequently played tracks in the database.
track_df = build_track_index()
print("Most-played tracks in database:")
top20 = track_df.head(20)
print(top20.to_string(index=False))
Exporting Data for External Tools
Export scraped data to formats compatible with Spotify API, music analysis tools, or streaming platforms:
def export_tracklist_for_spotify_search(cloudcast_id: tuple, db_path: str = "mixcloud.db") -> list[dict]:
    """Export a mix's tracklist formatted for Spotify API search.

    Args:
        cloudcast_id: (username, slug) pair identifying the mix.
        db_path: Path to the SQLite database built by this module.

    Returns:
        One dict per track (in play order) with a ready-made
        "spotify_query" string for the Spotify search endpoint.
    """
    username, slug = cloudcast_id
    conn = sqlite3.connect(db_path)
    rows = conn.execute("""
        SELECT artist, song, start_seconds
        FROM tracks
        WHERE username = ? AND cloudcast_slug = ?
        ORDER BY start_seconds
    """, (username, slug)).fetchall()
    conn.close()
    exported = []
    for artist, song, start in rows:
        exported.append({
            "artist": artist,
            "track": song,
            "start_time": f"{start // 60}:{start % 60:02d}",
            "spotify_query": f"track:{song} artist:{artist}",
        })
    return exported
def export_genre_trends_csv(trends_data: dict, output_path: str):
    """Export genre trends to CSV for analysis.

    Writes one row per genre with its raw count and its share of the
    total tag count, using the dict produced by genre_trends().
    """
    import csv
    genre_counts = trends_data["top_genres"]
    total = sum(genre_counts.values())
    with open(output_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["genre", "count", "pct_of_total"])
        writer.writeheader()
        for genre, count in genre_counts.items():
            row = {
                "genre": genre,
                "count": count,
                "pct_of_total": round(count / total * 100, 2),
            }
            writer.writerow(row)
    print(f"Exported to {output_path}")
Monitoring Radio Station Archives
NTS, Rinse FM, and Boiler Room all upload regularly. Automate monitoring for new content:
def monitor_new_uploads(station_username: str, db_path: str = "mixcloud.db", lookback_days: int = 7):
    """Check for new uploads from a station in the last N days.

    Fetches the station's 20 most recent sets and keeps those published
    on or after the cutoff. Unparseable or missing dates are skipped.
    """
    from datetime import datetime, timedelta, timezone
    cutoff = datetime.now(timezone.utc) - timedelta(days=lookback_days)
    recent = []
    page = fetch_cloudcasts(station_username, first=20)
    for upload in page["sets"]:
        raw_date = upload.get("date")
        if not raw_date:
            continue
        try:
            published = datetime.fromisoformat(raw_date.replace("Z", "+00:00"))
            if published >= cutoff:
                recent.append(upload)
        except (ValueError, TypeError):
            # bad date string, or naive datetime compared to aware cutoff
            pass
    return recent


# Check NTS Radio for new shows this week
new_shows = monitor_new_uploads("NTSRadio", lookback_days=7)
print(f"NTS Radio: {len(new_shows)} new shows this week")
for show in new_shows:
    print(f" {show['date'][:10]}: {show['name']} ({show['duration_minutes']}min, {show['listeners']} listeners)")