Scraping Mixcloud DJ Sets and Track Metadata with Python (2026)
Mixcloud is a music streaming platform focused on DJ mixes, radio shows, and long-form audio. Unlike Spotify or SoundCloud, it specializes in continuous mixes — which makes it a goldmine for anyone tracking DJ activity, music trends, or building a mix recommendation engine.
The good news: Mixcloud has a GraphQL API that powers their frontend. It's not officially documented for public use, but it's accessible and returns structured data. This guide covers how to use it to extract DJ profiles, set metadata, tracklists, and listener statistics.
Why Mixcloud Data is Valuable
Mixcloud's niche is long-form audio: radio shows, DJ mixes, podcast-style content. Data you can extract is useful for:
- Genre trend tracking — which electronic music sub-genres are gaining traction
- DJ analytics — listener counts, favorite counts, follower growth over time
- Tracklist databases — building a searchable index of which tracks appear in which mixes
- Recommendation engines — "fans of DJ X also listen to DJ Y" based on follower overlap
- Music label research — tracking which tracks labels are pushing to DJs before mainstream release
- Event booking intelligence — identifying rising DJs before they get expensive
Discovering the GraphQL API
Open any Mixcloud page and watch the Network tab. You'll see requests going to https://app.mixcloud.com/graphql. Every page load fires GraphQL queries for the data it needs — user profiles, cloudcasts (their term for uploads), tracklists, comments.
The API uses standard GraphQL — POST requests with a JSON body containing query and variables fields.
Setup
pip install httpx
No browser automation needed here. The GraphQL API responds to plain HTTP requests. httpx is enough.
Fetching a DJ's Profile
import json
import time

import httpx
import pandas as pd
# Mixcloud's internal GraphQL endpoint — the same one the web frontend calls.
GRAPHQL_URL = "https://app.mixcloud.com/graphql"
# Browser-like request headers. The API checks Referer and Origin, and
# rejects empty or bot-like User-Agent strings (see "Rate Limits" below).
HEADERS = {
"Content-Type": "application/json",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 Chrome/126.0.0.0 Safari/537.36",
"Referer": "https://www.mixcloud.com/",
"Accept": "application/json",
"Origin": "https://www.mixcloud.com",
}
def fetch_user_profile(username: str) -> dict:
    """Fetch a Mixcloud user's public profile via the GraphQL API.

    Args:
        username: Mixcloud username (the path segment of the profile URL).

    Returns:
        The ``user`` object from the GraphQL response (dict); None if the
        user does not exist.

    Raises:
        httpx.HTTPStatusError: on non-2xx responses.
        RuntimeError: if the API returns GraphQL-level errors (these come
            back with HTTP 200, so they must be checked explicitly).
    """
    # NOTE: the original used escaped quotes (\"\"\") here, which is a
    # Python syntax error — fixed to a normal triple-quoted string.
    query = """
    query UserProfile($lookup: UserLookup!) {
      user(lookup: $lookup) {
        username
        displayName
        biog
        city
        country
        followerCount
        followingCount
        cloudcastCount
        listeningCount
        isUploader
        isVerified
        picture {
          url
        }
        website
        createdTime
      }
    }
    """
    variables = {"lookup": {"username": username}}
    response = httpx.post(
        GRAPHQL_URL,
        json={"query": query, "variables": variables},
        headers=HEADERS,
        timeout=15,
    )
    response.raise_for_status()
    payload = response.json()
    # GraphQL reports errors in-band with a 200 status; surface them.
    if payload.get("errors"):
        raise RuntimeError(f"GraphQL errors: {payload['errors']}")
    return payload["data"]["user"]


profile = fetch_user_profile("NTSRadio")
print(json.dumps(profile, indent=2))
This returns follower counts, upload counts, location, and bio — structured data, no HTML parsing needed.
Listing DJ Sets (Cloudcasts)
def parse_upload_node(node: dict) -> dict:
    """Flatten one GraphQL upload node into a plain summary dict.

    Guards against null/missing optional fields: tags, categories, and
    engagement counters may be absent, and audioLength may be null for
    live shows (the original crashed on null with a TypeError).
    """
    duration = node["audioLength"] or 0  # may be null -> treat as 0
    return {
        "name": node["name"],
        "slug": node["slug"],
        "date": node["publishDate"],
        "duration_seconds": node["audioLength"],
        "duration_minutes": round(duration / 60, 1),
        "listeners": node.get("listenerCount", 0),
        "favorites": node.get("favoriteCount", 0),
        "reposts": node.get("repostCount", 0),
        "is_exclusive": node.get("isExclusive", False),
        "tags": [t["tag"]["name"] for t in (node.get("tags") or [])],
        "categories": [c["name"] for c in (node.get("categories") or [])],
        "widget_url": node.get("widgetUrl"),
    }


def fetch_cloudcasts(username: str, first: int = 20, after: str = None) -> dict:
    """Fetch one page of a user's uploads (cloudcasts) with metadata.

    Args:
        username: Mixcloud username.
        first: Page size.
        after: Opaque GraphQL cursor from a previous page (None = page 1).

    Returns:
        Dict with "sets" (list of parsed upload summaries), "has_next",
        "cursor" (pass as ``after`` for the next page), and "total_count".
    """
    # Fixed: the original delimited this string with \"\"\" (syntax error).
    query = """
    query UserCloudcasts(
      $lookup: UserLookup!,
      $first: Int!,
      $after: String
    ) {
      user(lookup: $lookup) {
        uploads(first: $first, after: $after) {
          edges {
            node {
              slug
              name
              publishDate
              audioLength
              listenerCount
              favoriteCount
              repostCount
              isExclusive
              widgetUrl
              picture {
                url
              }
              tags {
                tag {
                  name
                  slug
                }
              }
              categories {
                slug
                name
              }
            }
          }
          pageInfo {
            hasNextPage
            endCursor
          }
          totalCount
        }
      }
    }
    """
    variables = {
        "lookup": {"username": username},
        "first": first,
        "after": after,
    }
    response = httpx.post(
        GRAPHQL_URL,
        json={"query": query, "variables": variables},
        headers=HEADERS,
        timeout=15,
    )
    response.raise_for_status()
    data = response.json()["data"]["user"]["uploads"]
    return {
        "sets": [parse_upload_node(edge["node"]) for edge in data["edges"]],
        "has_next": data["pageInfo"]["hasNextPage"],
        "cursor": data["pageInfo"]["endCursor"],
        "total_count": data.get("totalCount"),
    }
# Demo: list the five most recent NTS Radio uploads.
result = fetch_cloudcasts("NTSRadio", first=5)
for s in result["sets"]:
    line = f"{s['name']} — {s['duration_minutes']}min, {s['listeners']} listeners, tags: {s['tags']}"
    print(line)
Extracting Tracklists
The tracklist for a mix is the most valuable data — it tells you which songs were played. Mixcloud requires uploaders to tag tracks (it's how they handle licensing), so most mixes have complete tracklists.
def parse_tracklist(cloudcast: dict) -> dict:
    """Convert a GraphQL cloudcast object into a tracklist summary dict.

    Handles sections where "artist", "song", or "startSeconds" are
    explicitly null (untagged portions of a mix). The original used
    ``section.get("artist", {})``, which only covers a *missing* key and
    raises AttributeError on an explicit JSON null.
    """
    tracks = []
    for section in (cloudcast.get("sections") or []):
        artist_obj = section.get("artist") or {}
        song_obj = section.get("song") or {}
        start = section.get("startSeconds") or 0  # null-safe
        tracks.append({
            "artist": artist_obj.get("name", "Unknown"),
            "artist_slug": artist_obj.get("slug", ""),
            "song": song_obj.get("name", "Unknown"),
            "song_slug": song_obj.get("slug", ""),
            "start_seconds": start,
            "start_time": f"{start // 60}:{start % 60:02d}",
        })
    return {
        "mix_name": cloudcast["name"],
        "publish_date": cloudcast.get("publishDate"),
        "listener_count": cloudcast.get("listenerCount"),
        "tracks": tracks,
        "track_count": len(tracks),
    }


def fetch_tracklist(username: str, cloudcast_slug: str) -> dict:
    """Fetch a mix's tracklist (artist, song, start time per section).

    Args:
        username: Uploader's Mixcloud username.
        cloudcast_slug: The mix's URL slug.

    Returns:
        Dict with "mix_name", "publish_date", "listener_count", "tracks"
        (list of track dicts), and "track_count". (The original return
        annotation said ``list``; the function actually returns a dict.)
    """
    # Fixed: the original delimited this string with \"\"\" (syntax error).
    query = """
    query CloudcastTracklist(
      $lookup: CloudcastLookup!
    ) {
      cloudcast(lookup: $lookup) {
        name
        publishDate
        listenerCount
        sections {
          artist {
            name
            slug
          }
          song {
            name
            slug
          }
          startSeconds
        }
      }
    }
    """
    variables = {
        "lookup": {
            "username": username,
            "slug": cloudcast_slug,
        }
    }
    response = httpx.post(
        GRAPHQL_URL,
        json={"query": query, "variables": variables},
        headers=HEADERS,
        timeout=15,
    )
    response.raise_for_status()
    cloudcast = response.json()["data"]["cloudcast"]
    return parse_tracklist(cloudcast)
# Demo: pull one tracklist and print each entry with its start offset.
tracklist = fetch_tracklist("NTSRadio", "some-show-slug")
print(f"Mix: {tracklist['mix_name']} ({tracklist['track_count']} tracks)")
for t in tracklist["tracks"]:
    entry = f" {t['start_time']} — {t['artist']} - {t['song']}"
    print(entry)
Pagination: Getting All Sets
Most DJs have hundreds of uploads. You need cursor-based pagination to get them all:
def fetch_all_cloudcasts(username: str, max_sets: int = None) -> list:
    """Paginate through a user's entire upload catalog.

    Follows GraphQL cursors 50 sets at a time, sleeping 2s between pages
    to stay under the rate limit. Stops early once ``max_sets`` sets have
    been collected (the result is trimmed to exactly that many).
    """
    collected = []
    cursor = None
    page_number = 1
    while True:
        print(f"Fetching page {page_number}...")
        page = fetch_cloudcasts(username, first=50, after=cursor)
        collected.extend(page["sets"])
        total = page.get("total_count")
        if total:
            print(f" Got {len(collected)}/{total}")
        if not page["has_next"]:
            break
        if max_sets and len(collected) >= max_sets:
            break
        cursor = page["cursor"]
        page_number += 1
        time.sleep(2)
    if max_sets:
        return collected[:max_sets]
    return collected


all_sets = fetch_all_cloudcasts("Boaboradio", max_sets=200)
print(f"Total: {len(all_sets)} sets")
Fetching Follower Lists
For social graph analysis — tracking which DJs share an audience — you can fetch follower data:
def fetch_followers(username: str, first: int = 50, after: str = None) -> dict:
    """Fetch one page of a user's followers for social-graph analysis.

    Args:
        username: Mixcloud username whose followers to list.
        first: Page size.
        after: Opaque GraphQL cursor from a previous page (None = page 1).

    Returns:
        Dict with "followers" (list of user dicts: username, displayName,
        followerCount, cloudcastCount, city, country), "has_next", and
        "cursor" for the next page.
    """
    # Fixed: the original delimited this string with \"\"\" (syntax error).
    query = """
    query UserFollowers($lookup: UserLookup!, $first: Int!, $after: String) {
      user(lookup: $lookup) {
        followers(first: $first, after: $after) {
          edges {
            node {
              username
              displayName
              followerCount
              cloudcastCount
              city
              country
            }
          }
          pageInfo {
            hasNextPage
            endCursor
          }
        }
      }
    }
    """
    variables = {
        "lookup": {"username": username},
        "first": first,
        "after": after,
    }
    response = httpx.post(
        GRAPHQL_URL,
        json={"query": query, "variables": variables},
        headers=HEADERS,
        timeout=15,
    )
    response.raise_for_status()
    data = response.json()["data"]["user"]["followers"]
    return {
        "followers": [edge["node"] for edge in data["edges"]],
        "has_next": data["pageInfo"]["hasNextPage"],
        "cursor": data["pageInfo"]["endCursor"],
    }
Rate Limits and Anti-Bot Protections
Mixcloud's GraphQL API is relatively permissive compared to most platforms, but it does have protections:
- Rate limiting: You'll start getting 429 responses if you exceed roughly 60 requests per minute. Keep it under that.
- IP blocking: Sustained heavy usage from a single IP will eventually get blocked. For large-scale collection (hundreds of DJs, full catalogs), rotate your IPs. ThorData's residential proxies are a good option here — rotating residential IPs prevent the consistent-IP pattern that triggers blocks.
- User-Agent validation: Send a realistic browser User-Agent. Empty or bot-like UAs get rejected.
- Referer header: Include `Referer: https://www.mixcloud.com/` — the API checks for it.
- Origin header: Include `Origin: https://www.mixcloud.com` as well.
def create_session_with_proxy(proxy_url: str = None) -> httpx.Client:
    """Build an httpx.Client preconfigured with browser-like headers.

    Pass a proxy URL ("http://user:pass@host:port") to route traffic
    through a rotating proxy; omit it for a direct connection.
    """
    return httpx.Client(
        headers=HEADERS,
        proxy=proxy_url or None,
        timeout=15,
        follow_redirects=True,
    )
def safe_graphql_request(client: httpx.Client, query: str, variables: dict, retries: int = 3) -> dict:
    """POST a GraphQL query with retry logic for rate limits and timeouts.

    - HTTP 429: back off linearly (30s, 60s, 90s) before retrying.
    - Timeout: wait 5s and retry.

    Raises:
        RuntimeError: when all retries are exhausted. (Was a bare
            ``Exception``; RuntimeError is more specific and still caught
            by any existing ``except Exception`` handler.)
        httpx.HTTPStatusError: on other non-2xx responses.
    """
    # Fixed: the original docstring used \"\"\" delimiters (syntax error).
    for attempt in range(retries):
        try:
            response = client.post(
                GRAPHQL_URL,
                json={"query": query, "variables": variables},
            )
            if response.status_code == 429:
                wait = 30 * (attempt + 1)  # linear backoff
                print(f"Rate limited. Waiting {wait}s...")
                time.sleep(wait)
                continue
            response.raise_for_status()
            return response.json()
        except httpx.TimeoutException:
            print(f"Timeout on attempt {attempt + 1}")
            time.sleep(5)
    raise RuntimeError("Max retries exceeded")


# Without proxy (fine for small jobs)
client = create_session_with_proxy()
# With rotating proxy (for bulk scraping)
client = create_session_with_proxy("http://user:[email protected]:9000")
Building a Genre Trend Tracker
Here's a practical application — tracking which genres are trending on Mixcloud by analyzing tag frequency across recent uploads:
from collections import Counter
def genre_trends(usernames: list, sets_per_user: int = 20) -> dict:
    """Aggregate tag frequencies across recent uploads of several DJs.

    Fetches up to ``sets_per_user`` recent sets per DJ, lowercases every
    tag, and returns the 30 most common tags plus summary counts.
    Per-DJ failures are logged and skipped.
    """
    tag_counts = Counter()
    analyzed = 0
    for username in usernames:
        try:
            page = fetch_cloudcasts(username, first=sets_per_user)
            for mix in page["sets"]:
                tag_counts.update(tag.lower() for tag in mix["tags"])
                analyzed += 1
        except Exception as e:
            print(f"Error for {username}: {e}")
        time.sleep(2)
    return {
        "top_genres": dict(tag_counts.most_common(30)),
        "total_sets_analyzed": analyzed,
        "unique_tags": len(tag_counts),
    }


# Track trends across popular DJ accounts
djs = ["Boaboradio", "NTSRadio", "RinseFM", "fabriclondon", "FABRICLIVE", "BerghainKantine"]
trends = genre_trends(djs)
print(f"Analyzed {trends['total_sets_analyzed']} sets, found {trends['unique_tags']} unique tags")
for genre, count in list(trends["top_genres"].items())[:15]:
    print(f" {genre}: {count}")
Storing Mix Data in SQLite
For building a searchable tracklist database:
import sqlite3
def init_mixcloud_db(db_path: str = "mixcloud.db") -> sqlite3.Connection:
    """Create (if needed) and open the mix-database schema.

    Tables:
        djs        — one row per scraped DJ profile.
        cloudcasts — one row per mix, keyed by (username, slug).
        tracks     — one row per track appearance, FK to cloudcasts.

    Returns an open sqlite3.Connection. Idempotent: safe to call on an
    existing database.
    """
    # Fixed: the original delimited the SQL strings with \"\"\" (syntax error).
    conn = sqlite3.connect(db_path)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS djs (
            username TEXT PRIMARY KEY,
            display_name TEXT,
            city TEXT,
            country TEXT,
            follower_count INTEGER,
            cloudcast_count INTEGER,
            scraped_at TEXT DEFAULT CURRENT_TIMESTAMP
        )
    """)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS cloudcasts (
            slug TEXT,
            username TEXT,
            name TEXT,
            publish_date TEXT,
            duration_seconds INTEGER,
            listener_count INTEGER,
            favorite_count INTEGER,
            repost_count INTEGER,
            tags TEXT,
            scraped_at TEXT DEFAULT CURRENT_TIMESTAMP,
            PRIMARY KEY (username, slug)
        )
    """)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS tracks (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            username TEXT,
            cloudcast_slug TEXT,
            artist TEXT,
            artist_slug TEXT,
            song TEXT,
            song_slug TEXT,
            start_seconds INTEGER,
            FOREIGN KEY (username, cloudcast_slug) REFERENCES cloudcasts(username, slug)
        )
    """)
    # Indexes support the artist/song lookups used elsewhere in the file.
    conn.execute("CREATE INDEX IF NOT EXISTS idx_tracks_artist ON tracks(artist)")
    conn.execute("CREATE INDEX IF NOT EXISTS idx_tracks_song ON tracks(song)")
    conn.commit()
    return conn
def save_mix_to_db(conn: sqlite3.Connection, username: str, tracklist_data: dict):
    """Save a full mix with its tracklist to the database.

    Idempotent: re-saving the same mix replaces the cloudcast row AND its
    track rows (the original only replaced the cloudcast row, so each
    re-save appended duplicate tracks).

    Note: fetch_tracklist() does not include "slug" in its result — set
    tracklist_data["slug"] before calling, or it defaults to "".
    """
    # Fixed: the original docstring used \"\"\" delimiters (syntax error).
    slug = tracklist_data.get("slug", "")
    conn.execute(
        "INSERT OR REPLACE INTO cloudcasts (slug, username, name, publish_date, listener_count) VALUES (?,?,?,?,?)",
        (slug, username, tracklist_data["mix_name"],
         tracklist_data.get("publish_date"), tracklist_data.get("listener_count"))
    )
    # Clear previously-saved tracks so repeated saves don't duplicate rows.
    conn.execute(
        "DELETE FROM tracks WHERE username = ? AND cloudcast_slug = ?",
        (username, slug)
    )
    for track in tracklist_data["tracks"]:
        conn.execute(
            "INSERT INTO tracks (username, cloudcast_slug, artist, artist_slug, song, song_slug, start_seconds) VALUES (?,?,?,?,?,?,?)",
            (username, slug, track["artist"], track.get("artist_slug", ""),
             track["song"], track.get("song_slug", ""), track["start_seconds"])
        )
    conn.commit()
def find_artist_appearances(conn: sqlite3.Connection, artist_name: str) -> list[dict]:
    """Find all mixes featuring a specific artist (substring match).

    Returns one dict per track appearance with the mix it appeared in,
    newest first. "start_time" is formatted as "M:SS".

    Bug fix: the tracks table stores start_seconds — the original
    selected a nonexistent t.start_time column, which raised
    sqlite3.OperationalError on every call.
    """
    # Also fixed: the original delimited the SQL with \"\"\" (syntax error).
    rows = conn.execute("""
        SELECT t.artist, t.song, t.start_seconds, c.name, c.username, c.publish_date
        FROM tracks t
        JOIN cloudcasts c ON t.username = c.username AND t.cloudcast_slug = c.slug
        WHERE t.artist LIKE ?
        ORDER BY c.publish_date DESC
    """, (f"%{artist_name}%",)).fetchall()
    return [
        {"artist": r[0], "song": r[1],
         "start_time": f"{(r[2] or 0) // 60}:{(r[2] or 0) % 60:02d}",
         "mix_name": r[3], "dj": r[4], "date": r[5]}
        for r in rows
    ]
GraphQL Schema Discovery
You can introspect the schema to discover available fields:
def introspect_type(type_name: str):
    """Introspect a GraphQL type to discover its available fields.

    Returns the raw GraphQL response dict. If the server has
    introspection disabled, "data" may be null or missing — callers
    should check before indexing into it.
    """
    # Fixed: the original delimited this string with \"\"\" (syntax error).
    query = """
    query IntrospectType($name: String!) {
      __type(name: $name) {
        name
        fields {
          name
          type {
            name
            kind
            ofType {
              name
              kind
            }
          }
          description
        }
      }
    }
    """
    response = httpx.post(
        GRAPHQL_URL,
        json={"query": query, "variables": {"name": type_name}},
        headers=HEADERS,
        timeout=15,
    )
    return response.json()


# Discover what fields are available on User type.
# `data` and `ofType` can be explicit JSON nulls — `or {}` guards .get().
schema = introspect_type("User")
if (schema.get("data") or {}).get("__type"):
    for field in schema["data"]["__type"]["fields"]:
        field_type = field["type"].get("name") or (field["type"].get("ofType") or {}).get("name")
        print(f" {field['name']}: {field_type}")
Introspection might be disabled — some deployments lock it down. But it's worth trying before you reverse-engineer queries from the frontend JavaScript.
Use Cases for Mixcloud Data
DJ career analytics: Track how a DJ's listener counts grow over time. A DJ with 2x listener growth in 3 months is someone to watch (or book before their rates go up).
Track popularity signals: Songs that appear in many mixes from respected DJs are likely to chart commercially. This is a leading indicator that labels and A&R use.
Label intelligence: Filter tracklists by record label to see how effectively a label is getting their releases into DJ sets. Higher placement rate = stronger label relationships.
Event programming: Cross-reference which DJs are getting the most play at which venues by scraping venue radio shows and resident DJ pages.
Playlist seeding: Extract tracklists from genre-defining mixes to seed a Spotify or Apple Music playlist with accurately categorized tracks.
Conclusion
Mixcloud's GraphQL API is one of the friendlier scraping targets. Structured data, cursor-based pagination, and relatively light anti-bot measures make it straightforward to build datasets of DJ activity, track plays, and genre trends. Keep your request rates reasonable, use proper headers, and you'll get far with just httpx and no browser automation. For bulk collection at scale, ThorData's residential proxy rotation handles the IP throttling without manual management.
Discovering Rising DJs
Track follower growth velocity to find DJs who are gaining traction:
import sqlite3
from datetime import datetime, date
def track_dj_growth(usernames: list[str], db_path: str = "mixcloud.db"):
    """Record follower counts for DJs over time to track growth.

    Writes one snapshot row per (DJ, day); running twice on the same day
    overwrites that day's row via INSERT OR REPLACE. Per-DJ failures are
    logged and skipped.
    """
    conn = init_mixcloud_db(db_path)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS growth_snapshots (
            username TEXT,
            snapshot_date TEXT,
            followers INTEGER,
            cloudcast_count INTEGER,
            PRIMARY KEY (username, snapshot_date)
        )
    """)
    conn.commit()
    snapshot_day = date.today().isoformat()
    for dj in usernames:
        try:
            profile = fetch_user_profile(dj)
            row = (dj, snapshot_day, profile.get("followerCount"), profile.get("cloudcastCount"))
            conn.execute("INSERT OR REPLACE INTO growth_snapshots VALUES (?,?,?,?)", row)
            print(f"{dj}: {profile.get('followerCount'):,} followers")
        except Exception as e:
            print(f"Error for {dj}: {e}")
        time.sleep(2)
    conn.commit()
    conn.close()
def find_fastest_growing(db_path: str = "mixcloud.db", min_snapshots: int = 7) -> list[dict]:
    """Find DJs with the fastest relative follower growth.

    Requires at least ``min_snapshots`` snapshots per DJ (collected by
    track_dj_growth). Growth is (max - min) / min followers over the
    observed period, as a percentage.
    """
    conn = sqlite3.connect(db_path)
    rows = conn.execute("""
        SELECT username,
               MIN(followers) as min_followers,
               MAX(followers) as max_followers,
               COUNT(*) as snapshot_count,
               MIN(snapshot_date) as first_date,
               MAX(snapshot_date) as last_date
        FROM growth_snapshots
        GROUP BY username
        HAVING snapshot_count >= ?
        ORDER BY (MAX(followers) - MIN(followers)) * 1.0 / NULLIF(MIN(followers), 0) DESC
        LIMIT 20
    """, (min_snapshots,)).fetchall()
    conn.close()
    return [
        {
            "username": r[0],
            "min_followers": r[1],
            "max_followers": r[2],
            # Bug fix: growth must use follower counts r[1]/r[2]; the
            # original mixed in r[3] (the snapshot COUNT), producing
            # nonsense percentages.
            "growth_pct": round((r[2] - r[1]) / max(r[1], 1) * 100, 1) if r[1] else 0,
            "period": f"{r[4]} to {r[5]}",
        }
        for r in rows
    ]
Building a Track Popularity Index
Count how many times each track appears across all scraped mixes to build a popularity index:
def build_track_index(db_path: str = "mixcloud.db") -> pd.DataFrame:
    """Build a popularity index of tracks across all scraped mixes.

    Counts how often each (artist, song) pair appears, how many distinct
    DJs played it, and when it was first/last seen. Excludes tracks
    tagged 'Unknown' and tracks seen only once.

    Bug fix: the file never imported pandas, so the ``pd`` reference
    raised NameError — pandas is now imported at the top of the file.
    """
    conn = sqlite3.connect(db_path)
    rows = conn.execute("""
        SELECT t.artist, t.song,
               COUNT(*) as appearance_count,
               COUNT(DISTINCT t.username) as unique_djs,
               MIN(c.publish_date) as first_seen,
               MAX(c.publish_date) as last_seen
        FROM tracks t
        JOIN cloudcasts c ON t.username = c.username AND t.cloudcast_slug = c.slug
        WHERE t.artist != 'Unknown' AND t.song != 'Unknown'
        GROUP BY t.artist, t.song
        HAVING appearance_count >= 2
        ORDER BY appearance_count DESC
    """).fetchall()
    conn.close()
    # Column order mirrors the SELECT list above.
    return pd.DataFrame(rows, columns=["artist", "song", "appearances", "unique_djs", "first_seen", "last_seen"])
# Demo: show the 20 most frequently played tracks in the database.
track_df = build_track_index()
print("Most-played tracks in database:")
top20 = track_df.head(20)
print(top20.to_string(index=False))
Exporting Data for External Tools
Export scraped data to formats compatible with Spotify API, music analysis tools, or streaming platforms:
def export_tracklist_for_spotify_search(cloudcast_id: tuple, db_path: str = "mixcloud.db") -> list[dict]:
    """Export a mix's tracklist formatted for Spotify API search.

    Args:
        cloudcast_id: (username, slug) pair identifying the mix.
        db_path: Path to the SQLite database built by this module.

    Returns:
        One dict per track (in play order) with a ready-made
        "spotify_query" string for the Spotify search endpoint.
    """
    username, slug = cloudcast_id
    conn = sqlite3.connect(db_path)
    rows = conn.execute("""
        SELECT artist, song, start_seconds
        FROM tracks
        WHERE username = ? AND cloudcast_slug = ?
        ORDER BY start_seconds
    """, (username, slug)).fetchall()
    conn.close()
    exported = []
    for artist, song, start in rows:
        exported.append({
            "artist": artist,
            "track": song,
            "start_time": f"{start // 60}:{start % 60:02d}",
            "spotify_query": f"track:{song} artist:{artist}",
        })
    return exported
def export_genre_trends_csv(trends_data: dict, output_path: str):
    """Export genre trends to CSV for analysis.

    Writes one row per genre with its raw count and its share of the
    total tag count, using the dict produced by genre_trends().
    """
    import csv
    genre_counts = trends_data["top_genres"]
    total = sum(genre_counts.values())
    with open(output_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["genre", "count", "pct_of_total"])
        writer.writeheader()
        for genre, count in genre_counts.items():
            row = {
                "genre": genre,
                "count": count,
                "pct_of_total": round(count / total * 100, 2),
            }
            writer.writerow(row)
    print(f"Exported to {output_path}")
Monitoring Radio Station Archives
NTS, Rinse FM, and Boiler Room all upload regularly. Automate monitoring for new content:
def monitor_new_uploads(station_username: str, db_path: str = "mixcloud.db", lookback_days: int = 7):
    """Check for new uploads from a station in the last N days.

    Fetches the station's 20 most recent sets and keeps those published
    on or after the cutoff. Unparseable or missing dates are skipped.
    """
    from datetime import datetime, timedelta, timezone
    cutoff = datetime.now(timezone.utc) - timedelta(days=lookback_days)
    recent = []
    page = fetch_cloudcasts(station_username, first=20)
    for upload in page["sets"]:
        raw_date = upload.get("date")
        if not raw_date:
            continue
        try:
            published = datetime.fromisoformat(raw_date.replace("Z", "+00:00"))
            if published >= cutoff:
                recent.append(upload)
        except (ValueError, TypeError):
            # bad date string, or naive datetime compared to aware cutoff
            pass
    return recent


# Check NTS Radio for new shows this week
new_shows = monitor_new_uploads("NTSRadio", lookback_days=7)
print(f"NTS Radio: {len(new_shows)} new shows this week")
for show in new_shows:
    print(f" {show['date'][:10]}: {show['name']} ({show['duration_minutes']}min, {show['listeners']} listeners)")