← Back to blog

How to Scrape Microsoft Store Apps: Metadata, Ratings & Download Estimates (2026)

The Microsoft Store lists over 1.4 million apps and games. Each listing exposes title, description, ratings, review counts, publisher info, screenshots, category, and pricing. Microsoft doesn't offer a public API for this data, so scraping is the path.

Two approaches work reliably: web scraping store pages (fine for one-off lookups, breaks when markup changes) and the DisplayCatalog API (unofficial but stable JSON endpoint the store frontend uses internally — better for bulk work). This guide covers both, plus how to discover app IDs at scale, handle rate limiting, and store results for ongoing monitoring.

Understanding the Microsoft Store Structure

Every Microsoft Store app has a product ID — a string like 9NZVDKPMR9RD (VLC). The app page URL follows the pattern https://apps.microsoft.com/detail/PRODUCT_ID. There are also category pages, search pages, and curated collections.

The store frontend is a React SPA that fetches data from several Microsoft-internal endpoints. The key ones: - displaycatalog.mp.microsoft.com — main catalog API for product details - apps.microsoft.com/api/products/search — search - storedgists.mp.microsoft.com — ratings and review aggregates - apps.microsoft.com/store/api/products — category listings

Approach 1: Web Scraping Store Pages

Every Microsoft Store app page at apps.microsoft.com/detail/PRODUCT_ID is server-rendered HTML (the React app hydrates on top of the pre-rendered markup), so a plain requests call gets you usable content.

import requests
from bs4 import BeautifulSoup
import json
import re

# Browser-realistic default headers, sent with every request so the store
# endpoints see an ordinary desktop Chrome client rather than a bare
# python-requests User-Agent.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
}


def scrape_store_page(product_id: str) -> dict:
    """Scrape a Microsoft Store app page for basic metadata.

    Combines three extraction strategies, preferring structured data:
    JSON-LD, the Next.js __NEXT_DATA__ payload, and CSS-selector
    fallbacks against the rendered markup.

    Args:
        product_id: Store product ID, e.g. "9NZVDKPMR9RD".

    Returns:
        Dict with title, description, publisher, category, rating,
        review_count, price, screenshots, url, plus the raw
        schema_data and next_data payloads for further digging.

    Raises:
        requests.HTTPError: on non-2xx responses.
    """
    url = f"https://apps.microsoft.com/detail/{product_id}"
    resp = requests.get(url, headers=HEADERS, timeout=15)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    def text(selector):
        el = soup.select_one(selector)
        return el.get_text(strip=True) if el else None

    # JSON-LD structured data. A script tag may hold a single object OR a
    # list of objects; calling .get() on a list raised AttributeError
    # before (not caught by the JSONDecodeError/TypeError handler).
    schema_data = {}
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            payload = json.loads(script.string)
        except (json.JSONDecodeError, TypeError):
            continue
        candidates = payload if isinstance(payload, list) else [payload]
        for data in candidates:
            if isinstance(data, dict) and data.get("@type") in ("SoftwareApplication", "Product"):
                schema_data = data
                break
        if schema_data:
            break

    # Next.js __NEXT_DATA__ often carries richer data than the markup.
    next_data = {}
    next_script = soup.find("script", id="__NEXT_DATA__")
    if next_script:
        try:
            next_data = json.loads(next_script.string)
        except (json.JSONDecodeError, TypeError):
            pass

    # aggregateRating may be absent or non-dict; normalize before .get().
    agg = schema_data.get("aggregateRating")
    agg = agg if isinstance(agg, dict) else {}

    title = (
        schema_data.get("name")
        or text("h1")
        or text('[data-testid="app-title"]')
    )

    # str("") is falsy, so a missing schema value falls through to the
    # selector fallback.
    rating = (
        str(agg.get("ratingValue", ""))
        or text('[data-testid="rating-value"]')
    )

    screenshots = [
        img["src"]
        for img in soup.select('[data-testid="screenshot"] img, .screenshot img')
        if img.get("src")
    ]

    author = schema_data.get("author")
    offers = schema_data.get("offers")

    return {
        "product_id": product_id,
        "title": title,
        "description": schema_data.get("description") or text('[data-testid="app-description"]'),
        "publisher": author.get("name") if isinstance(author, dict) else text('[data-testid="publisher-name"]'),
        "category": schema_data.get("applicationCategory") or text('[data-testid="app-category"]'),
        "rating": rating,
        "review_count": str(agg.get("reviewCount", "")) or text('[data-testid="rating-count"]'),
        "price": offers.get("price") if isinstance(offers, dict) else None,
        "screenshots": screenshots,
        "url": url,
        "schema_data": schema_data,
        # Previously extracted but silently discarded; returned so callers
        # can mine the richer payload themselves.
        "next_data": next_data,
    }

Approach 2: The DisplayCatalog API

The DisplayCatalog endpoint returns a structured JSON response with far more fields than the HTML exposes — age ratings, supported platforms, system requirements, pricing details, and localized metadata.

import requests
import time
import random

# Unofficial DisplayCatalog endpoint the store frontend queries internally.
CATALOG_URL = "https://displaycatalog.mp.microsoft.com/v7.0/products"


def fetch_app_catalog(
    product_id: str,
    market: str = "US",
    language: str = "en-US"
) -> dict:
    """Return the raw DisplayCatalog JSON for one product.

    Args:
        product_id: Store product ID, e.g. "9NZVDKPMR9RD".
        market: two-letter market code.
        language: BCP-47 language tag for localized metadata.

    Raises:
        requests.HTTPError: on non-2xx responses.
    """
    # The MS-CV correlation vector mirrors what the store frontend sends.
    query = {
        "productId": product_id,
        "market": market,
        "languages": language,
        "MS-CV": "DGU1mcuYo0WMMp+F.1",
    }
    response = requests.get(CATALOG_URL, params=query, headers=HEADERS, timeout=15)
    response.raise_for_status()
    return response.json()


def fetch_multiple_apps(
    product_ids: list[str],
    market: str = "US",
    language: str = "en-US"
) -> dict:
    """Fetch up to 20 apps in a single API request.

    Args:
        product_ids: Store product IDs. NOTE: only the first 20 are sent;
            any extras are silently dropped, so batch before calling.
        market: two-letter market code.
        language: BCP-47 language tag (new parameter, default preserves
            the previous hardcoded "en-US"; matches fetch_app_catalog).

    Raises:
        requests.HTTPError: on non-2xx responses.
    """
    # The API accepts comma-separated product IDs, capped at 20 per call.
    params = {
        "productId": ",".join(product_ids[:20]),  # max 20 per request
        "market": market,
        "languages": language,
        "MS-CV": "DGU1mcuYo0WMMp+F.1",
    }
    resp = requests.get(CATALOG_URL, params=params, headers=HEADERS, timeout=20)
    resp.raise_for_status()
    return resp.json()


def parse_catalog_response(data: dict) -> dict:
    """Parse the DisplayCatalog API response into a clean structure.

    Defensive against partially-populated responses: every nested access
    tolerates missing keys, None values, and — unlike the original —
    present-but-EMPTY lists. `d.get("Key", [{}])[0]` only protects
    against a *missing* key; an empty list raised IndexError.

    Args:
        data: raw API payload, shaped like {"Product": {...}}.

    Returns:
        Flat dict of app metadata (title, publisher, ratings, pricing,
        images, system requirements, ...). Missing fields come back as
        None / empty containers rather than raising.
    """

    def _first(seq):
        # First element of a list-ish value, or {} when absent/empty/None.
        return seq[0] if seq else {}

    product = data.get("Product") or {}
    local = _first(product.get("LocalizedProperties"))
    display_sku = _first(product.get("DisplaySkuAvailabilities"))
    sku = display_sku.get("Sku") or {}
    sku_local = _first(sku.get("LocalizedProperties"))
    market_props = _first(product.get("MarketProperties"))
    usage_data = market_props.get("UsageData") or []
    props = product.get("Properties") or {}

    # Ratings live in one of several UsageData aggregation windows; take
    # the first entry that actually carries an aggregate rating.
    avg_rating = None
    rating_count = None
    for item in usage_data:
        if item.get("AggregatedClientRating"):
            avg_rating = item.get("AggregatedClientRating")
            rating_count = item.get("RatingCount")
            break

    # Images are tagged by purpose: Tile = icon, Screenshot, SuperHeroArt.
    images = local.get("Images") or []
    icon = next((img.get("Uri") for img in images if img.get("ImagePurpose") == "Tile"), None)
    screenshots = [img.get("Uri") for img in images if img.get("ImagePurpose") == "Screenshot"]
    hero = next((img.get("Uri") for img in images if img.get("ImagePurpose") == "SuperHeroArt"), None)

    # System requirements, keyed by lowercased section title
    # (e.g. "minimum", "recommended").
    requirements = {}
    for req in (sku.get("Properties") or {}).get("SystemRequirements") or []:
        requirements[(req.get("Title") or "").lower()] = req.get("Items", [])

    # Pricing: first availability carrying order-management price data wins.
    price = None
    for avail in display_sku.get("Availabilities") or []:
        order_terms = (avail.get("OrderManagementData") or {}).get("Price") or {}
        if order_terms:
            price = {
                "amount": order_terms.get("ListPrice"),
                "currency": order_terms.get("CurrencyCode"),
                "msrp": order_terms.get("MSRP"),
            }
            break

    return {
        "product_id": product.get("ProductId"),
        "title": local.get("ProductTitle"),
        "description": local.get("ProductDescription"),
        "short_description": local.get("ShortDescription"),
        "publisher": local.get("PublisherName"),
        "publisher_id": product.get("PublisherId"),
        "publisher_website": local.get("PublisherWebsiteUri"),
        "developer": local.get("DeveloperName"),
        "category": props.get("Category"),
        "subcategory": props.get("SubCategory"),
        "age_rating": props.get("AgeRating"),
        "content_ratings": props.get("ContentRatings", []),
        "rating": avg_rating,
        "rating_count": rating_count,
        "release_date": market_props.get("OriginalReleaseDate"),
        "last_update": market_props.get("MainPublishDate"),
        "languages": local.get("Languages", []),
        "platforms": props.get("AllowedPlatforms", []),
        "price": price,
        "sku_title": sku_local.get("SkuTitle"),
        "sku_description": sku_local.get("SkuDescription"),
        "keywords": local.get("SearchTitles", []),
        "icon": icon,
        "hero": hero,
        "screenshots": screenshots,
        "requirements": requirements,
        "tags": product.get("Tags", []),
    }

Discovering Apps at Scale

Method 1: Search API

def search_apps(
    query: str,
    client: requests.Session,
    market: str = "US",
    max_results: int = 200
) -> list[str]:
    """Search the Microsoft Store and return product IDs.

    Pages through the search endpoint 50 results at a time, stopping at
    max_results, on an empty page, or on any non-200 response.

    Args:
        query: free-text search query.
        client: configured requests session (headers/proxies applied).
        market: two-letter market code.
        max_results: hard cap on the number of IDs returned.
    """
    url = "https://apps.microsoft.com/api/products/search"
    product_ids = []
    skip = 0
    page_size = 50

    while len(product_ids) < max_results:
        params = {
            "query": query,
            "market": market,
            "locale": "en-US",
            "mediaType": "apps",
            "age": "all",
            "pageSize": page_size,
            "skipItems": skip,
        }
        resp = client.get(url, params=params, timeout=15)
        if resp.status_code != 200:
            break

        data = resp.json()
        items = data.get("items", [])
        if not items:
            break

        # Skip result entries lacking a productId: the bare
        # item["productId"] raised KeyError before (browse_category
        # already filtered this way — now consistent).
        product_ids.extend(
            item["productId"] for item in items if item.get("productId")
        )
        skip += page_size
        total = data.get("totalCount", 0)
        if skip >= total:
            break

        time.sleep(random.uniform(1, 2.5))

    return product_ids[:max_results]

Method 2: Category Browsing

Microsoft Store categories return paginated app listings via XHR. Intercept these requests in DevTools while browsing the store, then replay them directly:

def browse_category(
    category_id: str,
    client: requests.Session,
    market: str = "US",
    max_items: int = 500
) -> list[str]:
    """Browse a store category and collect product IDs.

    Args:
        category_id: category name, e.g. "Games" or "Productivity".
        client: configured requests session (headers/proxies applied).
        market: two-letter market code.
        max_items: hard cap on the number of IDs returned.
    """
    url = "https://apps.microsoft.com/store/api/products"
    product_ids = []
    page = 1
    page_size = 40  # single source of truth; was duplicated as a literal

    while len(product_ids) < max_items:
        params = {
            "category": category_id,
            "market": market,
            "locale": "en-US",
            "page": page,
            "pageSize": page_size,
        }
        resp = client.get(url, params=params, timeout=15)
        if resp.status_code != 200:
            break

        data = resp.json()
        items = data.get("items", [])
        if not items:
            break

        product_ids.extend(i.get("productId") for i in items if i.get("productId"))
        page += 1
        # BUG FIX: the old check `page * 40 >= totalCount` ran AFTER the
        # increment, so the last page was skipped whenever totalCount was
        # an exact multiple of the page size. Compare fetched pages instead.
        if (page - 1) * page_size >= data.get("totalCount", 0):
            break

        time.sleep(random.uniform(1, 2))

    return product_ids[:max_items]

Category IDs include: "Games", "Productivity", "Entertainment", "Developer+tools", "Photo+%26+video", "Music", "Education", "Utilities+%26+tools".

Method 3: Scraping the Sitemap

Microsoft publishes a sitemap at https://apps.microsoft.com/sitemap.xml that contains every app URL. Parse it to get a complete product ID list:

from xml.etree import ElementTree

def parse_sitemap(client: requests.Session) -> list[str]:
    """Extract all product IDs from the Microsoft Store sitemap."""
    resp = client.get("https://apps.microsoft.com/sitemap.xml", timeout=30)
    resp.raise_for_status()

    # <loc> entries live in the standard sitemap namespace.
    root = ElementTree.fromstring(resp.content)
    ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}

    ids = []
    for loc in root.findall(".//sm:loc", ns):
        href = loc.text or ""
        if "/detail/" not in href:
            continue
        # The product ID is everything after /detail/, minus any query string.
        tail = href.split("/detail/")[-1]
        candidate = tail.split("?")[0].strip()
        if candidate:
            ids.append(candidate)
    return ids

Anti-Bot Measures and Proxy Setup

Microsoft doesn't use Cloudflare, but runs its own rate limiting and IP-level blocking on store endpoints. During light, single-IP usage you won't hit problems. The moment you run concurrent requests across hundreds of apps, you'll see 429s and eventually hard blocks.

Three factors that trigger blocks: high request rate from a single IP, missing or inconsistent browser headers, and datacenter IP ranges (AWS, GCP, Azure ranges are flagged quickly — ironic given Microsoft's own cloud).

For production-scale scraping, rotate residential IPs. ThorData provides rotating residential proxies with city-level targeting, useful for US-specific results:

import requests

def create_session(proxy_url: str | None = None) -> requests.Session:
    """Create a requests session with browser-realistic settings.

    Args:
        proxy_url: optional proxy URL; when truthy it is applied to both
            HTTP and HTTPS traffic.
    """
    sess = requests.Session()
    sess.headers.update(HEADERS)
    if proxy_url:
        # Route both schemes through the same proxy endpoint.
        sess.proxies = {"http": proxy_url, "https": proxy_url}
    return sess

# Usage
# NOTE(review): placeholder credentials — substitute real proxy auth before running.
PROXY_URL = "http://USER:[email protected]:9000"
session = create_session(PROXY_URL)

Rate limiting rules: - Keep requests under 1 per second per IP, even with rotation - Add time.sleep(random.uniform(1.0, 2.5)) between catalog API calls - For search/browse: minimum 2 seconds between page requests - For the DisplayCatalog API batch endpoint (20 products per request): minimum 3 seconds between batches

Handling 429 Responses

def resilient_catalog_fetch(
    product_id: str,
    session: requests.Session,
    max_retries: int = 5
) -> dict | None:
    """Fetch catalog data with exponential backoff on rate limits.

    Returns:
        The parsed JSON payload, or None when the app does not exist
        (404) or all retries are exhausted.
    """
    for attempt in range(max_retries):
        try:
            resp = session.get(
                CATALOG_URL,
                params={
                    "productId": product_id,
                    "market": "US",
                    "languages": "en-US",
                    "MS-CV": "DGU1mcuYo0WMMp+F.1",
                },
                timeout=15,
            )
            if resp.status_code == 429:
                # Retry-After may be a delay in seconds OR an HTTP-date
                # (RFC 9110 allows both). int() on a date string raised
                # an uncaught ValueError here; fall back to exponential
                # backoff when the header isn't a plain integer.
                try:
                    retry_after = int(resp.headers.get("Retry-After", 2 ** attempt))
                except ValueError:
                    retry_after = 2 ** attempt
                wait = retry_after + random.uniform(0, 2)
                print(f"Rate limited. Waiting {wait:.1f}s...")
                time.sleep(wait)
                continue
            if resp.status_code == 404:
                return None  # App not found or removed
            resp.raise_for_status()
            return resp.json()
        except requests.RequestException as e:
            if attempt == max_retries - 1:
                print(f"Failed after {max_retries} attempts: {e}")
                return None
            time.sleep(2 ** attempt)
    return None

Storing Results and Tracking Changes

SQLite is enough for most monitoring tasks. Store with a fetched timestamp so you can detect rating and download count changes over time:

import sqlite3
import json
from datetime import datetime, timezone


def init_db(path: str = "msstore.db") -> sqlite3.Connection:
    """Open (creating if needed) the snapshot database.

    Each row is one (product_id, fetched_at) snapshot, so repeated
    scrapes accumulate a per-app time series instead of overwriting.

    Args:
        path: database file path; ":memory:" works for testing.

    Returns:
        An open sqlite3 connection with the schema in place.
    """
    connection = sqlite3.connect(path)
    schema = """
        CREATE TABLE IF NOT EXISTS apps (
            product_id TEXT NOT NULL,
            fetched_at TEXT NOT NULL,
            title TEXT,
            publisher TEXT,
            publisher_id TEXT,
            category TEXT,
            subcategory TEXT,
            age_rating TEXT,
            rating REAL,
            rating_count INTEGER,
            release_date TEXT,
            last_update TEXT,
            price_amount REAL,
            price_currency TEXT,
            raw_json TEXT,
            PRIMARY KEY (product_id, fetched_at)
        );

        CREATE INDEX IF NOT EXISTS idx_product ON apps(product_id);
        CREATE INDEX IF NOT EXISTS idx_category ON apps(category);
        CREATE INDEX IF NOT EXISTS idx_rating ON apps(rating DESC);
    """
    connection.executescript(schema)
    connection.commit()
    return connection


def save_app(conn: sqlite3.Connection, parsed: dict, raw: dict) -> None:
    """Insert one timestamped snapshot row for an app.

    Uses .get() uniformly for every field: the original indexed several
    keys directly (parsed["title"], parsed["rating"], ...) and raised
    KeyError on partially-populated dicts while tolerating the absence
    of others — now consistent, with missing fields stored as NULL.

    Args:
        conn: open connection with the `apps` table (see init_db).
        parsed: output of parse_catalog_response (or compatible dict).
        raw: raw API payload, stored verbatim as JSON for re-parsing.
    """
    price = parsed.get("price") or {}
    conn.execute("""
        INSERT OR REPLACE INTO apps (
            product_id, fetched_at, title, publisher, publisher_id,
            category, subcategory, age_rating, rating, rating_count,
            release_date, last_update, price_amount, price_currency, raw_json
        ) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
    """, (
        parsed.get("product_id"),
        # UTC timestamp forms half of the primary key → history, not upsert.
        datetime.now(timezone.utc).isoformat(),
        parsed.get("title"),
        parsed.get("publisher"),
        parsed.get("publisher_id"),
        parsed.get("category"),
        parsed.get("subcategory"),
        parsed.get("age_rating"),
        parsed.get("rating"),
        parsed.get("rating_count"),
        parsed.get("release_date"),
        parsed.get("last_update"),
        price.get("amount"),
        price.get("currency"),
        json.dumps(raw),
    ))
    conn.commit()


def get_rating_history(
    conn: sqlite3.Connection,
    product_id: str
) -> list[dict]:
    """Get time-series rating data for an app."""
    query = (
        "SELECT fetched_at, rating, rating_count FROM apps "
        "WHERE product_id = ? ORDER BY fetched_at"
    )
    history = []
    # Rows come back oldest-first; reshape each into a small dict.
    for fetched_at, rating, count in conn.execute(query, (product_id,)):
        history.append({"date": fetched_at, "rating": rating, "count": count})
    return history

Bulk Scraping Pipeline

Putting it all together for a full category scrape:

def scrape_category_full(
    category_id: str,
    proxy_url: str | None = None,
    max_apps: int = 1000
) -> None:
    """Complete pipeline: discover apps, fetch details, save to DB.

    Args:
        category_id: store category name, e.g. "Productivity".
        proxy_url: optional proxy for the discovery session.
        max_apps: cap on how many apps to discover in the category.
    """
    conn = init_db("msstore.db")
    session = create_session(proxy_url)

    print(f"Discovering apps in category: {category_id}")
    product_ids = browse_category(category_id, session, max_items=max_apps)
    print(f"Found {len(product_ids)} apps")

    # The DisplayCatalog API accepts up to 20 products per request, so
    # slice the ID list into fixed-size batches up front.
    batch_size = 20
    batches = [
        product_ids[start:start + batch_size]
        for start in range(0, len(product_ids), batch_size)
    ]

    for batch_num, batch in enumerate(batches, start=1):
        print(f"Processing batch {batch_num}: {batch}")

        try:
            raw_data = fetch_multiple_apps(batch, market="US")
        except requests.HTTPError as e:
            print(f"Batch error: {e}")
        else:
            # Each product in the batch response is parsed and stored
            # independently so one bad record doesn't sink the batch.
            for product_data in raw_data.get("Products", []):
                try:
                    parsed = parse_catalog_response({"Product": product_data})
                    save_app(conn, parsed, product_data)
                    print(f"  Saved: {parsed.get('title')} — rating: {parsed.get('rating')}")
                except Exception as e:
                    print(f"  Parse error: {e}")

        time.sleep(random.uniform(3, 6))

    print(f"\nDone. Database: msstore.db")


if __name__ == "__main__":
    # NOTE(review): placeholder proxy credentials — replace before running.
    PROXY_URL = "http://USER:[email protected]:9000"
    scrape_category_full("Productivity", proxy_url=PROXY_URL, max_apps=500)

Monitoring Rating Changes

Run a daily cron job and alert when ratings shift significantly:

def check_rating_drops(
    conn: sqlite3.Connection,
    threshold: float = 0.2
) -> list[dict]:
    """Find apps whose rating dropped by threshold since last snapshot."""
    rows = conn.execute("""
        WITH ranked AS (
            SELECT
                product_id,
                title,
                rating,
                fetched_at,
                LAG(rating) OVER (PARTITION BY product_id ORDER BY fetched_at) as prev_rating
            FROM apps
        )
        SELECT product_id, title, prev_rating, rating,
               (prev_rating - rating) as drop
        FROM ranked
        WHERE prev_rating IS NOT NULL
          AND (prev_rating - rating) >= ?
        ORDER BY drop DESC
    """, (threshold,)).fetchall()

    return [
        {
            "product_id": r[0],
            "title": r[1],
            "prev_rating": r[2],
            "current_rating": r[3],
            "drop": r[4],
        }
        for r in rows
    ]

Scraping publicly visible Microsoft Store pages is generally permissible for research and personal use under the hiQ v. LinkedIn legal framework. Microsoft's Terms of Use prohibit automated access without permission, so commercial products built on this data carry exposure. The DisplayCatalog API is unofficial — it's designed for the store frontend, not third-party use. Personal research is low risk. Avoid redistributing user-generated review content verbatim. Rate-limit reasonably — and note the irony that datacenter IPs from Microsoft's own Azure cloud are among the ranges flagged fastest when scraping its store.