How to Scrape Google Play Store with Python (2026)
The Google Play Store holds data on millions of Android apps — ratings, reviews, download counts, pricing, developer info, and version history. Google doesn't provide a public API for this data, but the Play Store's web frontend uses a set of internal RPC endpoints that return structured data.
This guide shows you how to extract app information from Google Play using Python, covering: basic app details from HTML, reviews via the batchexecute RPC endpoint, developer app listings, handling anti-bot systems with proxy rotation, and exporting data to CSV and SQLite.
Environment Setup
pip install httpx beautifulsoup4 lxml
For headless browser fallback:
pip install playwright
python -m playwright install chromium
Approach 1: Basic App Details via HTML
The simplest approach is parsing the app detail page directly. Each app has a page at play.google.com/store/apps/details?id=PACKAGE_NAME:
import httpx
from bs4 import BeautifulSoup
import json
import re
import time
import random
# Browser-like default headers for all Play Store page requests.
# A realistic Chrome User-Agent plus an English Accept-Language helps avoid
# bot challenges and region/consent redirects; the Referer mimics in-store
# navigation.
PLAY_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Referer": "https://play.google.com/",
    "DNT": "1",
}
def get_app_details(package_name: str, proxy_url: str = None) -> dict:
    """Fetch and parse an app's Play Store detail page.

    Args:
        package_name: Android package id, e.g. "com.spotify.music".
        proxy_url: optional proxy URL passed through httpx's ``proxy`` kwarg.

    Returns:
        Dict of app fields. On HTTP 404 returns
        ``{"error": "App not found", "package_name": ...}`` instead.

    Raises:
        httpx.HTTPStatusError: for non-404 error responses.
    """
    url = f"https://play.google.com/store/apps/details?id={package_name}&hl=en&gl=us"
    kwargs = {"headers": PLAY_HEADERS, "timeout": 30, "follow_redirects": True}
    if proxy_url:
        kwargs["proxy"] = proxy_url
    response = httpx.get(url, **kwargs)
    if response.status_code == 404:
        return {"error": "App not found", "package_name": package_name}
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    # App name from the visible <h1>.
    name = None
    name_tag = soup.select_one("h1[itemprop='name']")
    if name_tag:
        name = name_tag.text.strip()

    # Developer display name from the developer-page link.
    dev_tag = soup.select_one("a[href*='/store/apps/dev']")
    developer = dev_tag.text.strip() if dev_tag else None

    # Star rating from markup; structured data below is the fallback.
    rating = None
    rating_count = None
    rating_tag = soup.select_one("div[itemprop='starRating'] div")
    if rating_tag:
        match = re.search(r"([\d.]+)", rating_tag.text)
        if match:
            rating = float(match.group(1))

    # JSON-LD structured data (most reliable source when present).
    structured = {}
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string)
            if data.get("@type") == "SoftwareApplication":
                structured = data
                break
        except (json.JSONDecodeError, TypeError):
            continue

    # schema.org allows "offers" to be a single object OR a list of objects;
    # normalize to a dict so .get("price") below can't blow up on a list.
    offers = structured.get("offers", {})
    if isinstance(offers, list):
        offers = offers[0] if offers else {}
    agg = structured.get("aggregateRating", {}) or {}

    # Download count appears as free text like "1B+ downloads".
    installs = None
    for span in soup.find_all("span"):
        text = span.get_text(strip=True)
        if re.match(r"[\d.,]+[KMB]?\+? downloads?", text, re.IGNORECASE):
            installs = text
            break

    # Content rating badge text.
    content_rating = None
    content_tag = soup.find("span", string=re.compile(r"Rated for|Everyone|Teen|Mature"))
    if content_tag:
        content_rating = content_tag.get_text(strip=True)

    # Version / last-updated live in label+value sibling pairs.
    # NOTE(review): 'ClM7O' is an obfuscated, Google-generated class name and
    # can change without notice — re-verify when this stops matching.
    version = None
    updated = None
    for info_div in soup.select("div[class*='ClM7O']"):
        label = info_div.find_previous_sibling()
        if label:
            label_text = label.get_text(strip=True).lower()
            value_text = info_div.get_text(strip=True)
            if "version" in label_text:
                version = value_text
            elif "updated" in label_text:
                updated = value_text

    return {
        "package_name": package_name,
        "name": name or structured.get("name"),
        "developer": developer,
        "rating": rating or agg.get("ratingValue"),
        "rating_count": rating_count or agg.get("ratingCount"),
        "installs": installs,
        "price": offers.get("price", "Free"),
        "category": structured.get("applicationCategory"),
        "content_rating": content_rating,
        "version": version,
        "updated": updated,
        "os": structured.get("operatingSystem"),
        "description": structured.get("description", "")[:500] if structured.get("description") else None,
    }
# Example usage: print every field except the long description.
app = get_app_details("com.spotify.music")
for field, val in app.items():
    if field == "description":
        continue
    print(f"{field}: {val}")
Approach 2: Fetching Reviews via batchexecute
Google Play reviews are loaded dynamically through Google's batchexecute RPC system. This is the same mechanism used across Google products — Maps, Search, YouTube. The requests look strange, but they follow a consistent pattern.
import urllib.parse
def get_reviews(package_name: str, count: int = 40,
                sort: int = 2, token: str = None,
                proxy_url: str = None) -> tuple:
    """
    Fetch one page of reviews via Google's batchexecute RPC endpoint.

    Args:
        package_name: Android package id.
        count: number of reviews requested for this page.
        sort: 1=newest, 2=most relevant, 3=highest rated, 4=lowest rated
        token: pagination token from a previous call, or None for page one.
        proxy_url: optional proxy URL.

    Returns:
        (reviews_list, next_page_token); next_page_token is None on the last
        page or when the response could not be parsed.

    Raises:
        Exception: on HTTP 429 (rate limited); httpx errors for other failures.
    """
    # Build the protobuf-like request payload. The pagination slot is
    # [count, null, "token"] when continuing, [count, null, null] for page one.
    # (Fixed: the token branch previously ended with a stray line-continuation
    # backslash, which was a syntax error.)
    if token:
        escaped_token = token.replace('"', '\\"')
        page_slot = f'[{count},null,\\"{escaped_token}\\"]'
    else:
        page_slot = f'[{count},null,null]'
    request_payload = (
        f'[[["UsvDTd","[null,null,[2,{sort},{page_slot},'
        f'null,null,null,null,[2]],[\\"{package_name}\\",7]]",null,"generic"]]]'
    )
    url = "https://play.google.com/_/PlayStoreUi/data/batchexecute"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Content-Type": "application/x-www-form-urlencoded;charset=utf-8",
        "Accept": "*/*",
        "Origin": "https://play.google.com",
        "Referer": f"https://play.google.com/store/apps/details?id={package_name}",
    }
    params = {"hl": "en", "gl": "us"}
    body = f"f.req={urllib.parse.quote(request_payload)}"
    kwargs = {"headers": headers, "params": params, "content": body, "timeout": 30}
    if proxy_url:
        kwargs["proxy"] = proxy_url
    response = httpx.post(url, **kwargs)
    if response.status_code == 429:
        raise Exception("Rate limited by Google Play")
    response.raise_for_status()

    # Google prefixes batchexecute responses with )]}' (anti-JSON-hijacking);
    # strip it before parsing.
    text = response.text
    if text.startswith(")]}\'"):
        text = text[4:].lstrip("\n")
    try:
        outer = json.loads(text)
        inner_str = outer[0][2]
        if not inner_str:
            return [], None
        inner = json.loads(inner_str)
        reviews_raw = inner[0] or []
        # The continuation token lives at inner[1][1] when more pages exist.
        next_token = inner[1][1] if len(inner) > 1 and inner[1] else None
    except (json.JSONDecodeError, IndexError, TypeError) as e:
        print(f"Parse error: {e}")
        return [], None

    reviews = []
    for r in reviews_raw:
        try:
            # Developer reply is nested at index 7 when present.
            dev_reply = None
            if len(r) > 7 and r[7]:
                try:
                    dev_reply = r[7][1]
                except (IndexError, TypeError):
                    pass
            reviews.append({
                "review_id": r[0] if r[0] else None,
                "author": r[1][0] if r[1] else None,
                "avatar_url": r[1][1][3][2] if (r[1] and len(r[1]) > 1) else None,
                "rating": r[2],
                "text": r[4],
                "thumbs_up": r[6],
                "timestamp": r[5][0] if r[5] else None,  # Unix timestamp
                "app_version": r[10] if len(r) > 10 else None,
                "developer_reply": dev_reply,
                "reply_timestamp": r[7][2][0] if (dev_reply and len(r[7]) > 2) else None,
            })
        except (IndexError, TypeError):
            # Skip entries whose array layout doesn't match expectations.
            continue
    return reviews, next_token
def get_all_reviews(package_name: str, max_reviews: int = 200,
                    sort: int = 1, proxy_url: str = None) -> list:
    """Walk the review pages until max_reviews are collected or none remain."""
    collected = []
    page_token = None
    while len(collected) < max_reviews:
        remaining = max_reviews - len(collected)
        try:
            page, page_token = get_reviews(
                package_name, count=min(40, remaining),
                sort=sort, token=page_token, proxy_url=proxy_url
            )
        except Exception as e:
            print(f"Error fetching reviews: {e}")
            break
        if not page:
            break
        collected.extend(page)
        print(f"Fetched {len(collected)}/{max_reviews} reviews")
        if not page_token:
            break
        # Jittered pause between pages to stay under rate limits.
        time.sleep(random.uniform(1.5, 3.5))
    return collected[:max_reviews]
# Fetch reviews for Spotify and print a short summary of the first five.
# The datetime import is hoisted out of the loop: imports belong at the top
# and should not be re-executed on every iteration.
from datetime import datetime

reviews = get_all_reviews("com.spotify.music", max_reviews=100, sort=1)
for review in reviews[:5]:
    date = datetime.fromtimestamp(review["timestamp"]).strftime("%Y-%m-%d") if review["timestamp"] else "N/A"
    print(f"[{review['rating']}/5] {date} — {review['text'][:100]}...")
    print(f" By: {review['author']} | Helpful: {review['thumbs_up']}")
    if review["developer_reply"]:
        print(f" Dev reply: {review['developer_reply'][:80]}...")
    print()
Approach 3: Batch Scraping Multiple Apps
from concurrent.futures import ThreadPoolExecutor, as_completed
def scrape_app_full(package_name: str, max_review_pages: int = 3,
                    proxy_url: str = None) -> dict:
    """Scrape an app's details plus up to max_review_pages pages of reviews."""
    record = get_app_details(package_name, proxy_url=proxy_url)
    if "error" in record:
        # Detail fetch failed (e.g. 404) — propagate the error dict as-is.
        return record
    fetched = get_all_reviews(
        package_name, max_reviews=max_review_pages * 40,
        sort=1, proxy_url=proxy_url
    )
    record["reviews"] = fetched
    record["review_count_scraped"] = len(fetched)
    return record
def batch_scrape(package_names: list, proxy_url: str = None,
                 max_workers: int = 3) -> list:
    """Scrape several apps concurrently with a small, bounded worker pool."""
    def scrape_one(pkg):
        # Stagger worker start times so requests don't land simultaneously.
        time.sleep(random.uniform(0.5, 2.0))
        return scrape_app_full(pkg, max_review_pages=2, proxy_url=proxy_url)

    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        pending = {executor.submit(scrape_one, name): name for name in package_names}
        for done in as_completed(pending):
            name = pending[done]
            try:
                data = done.result()
            except Exception as exc:
                print(f"Failed: {name} — {exc}")
                results.append({"package_name": name, "error": str(exc)})
            else:
                results.append(data)
                print(f"Done: {name} — {data.get('name')}, {data.get('review_count_scraped', 0)} reviews")
    return results
Scraping Developer App Lists
def get_developer_apps(developer_id: str, proxy_url: str = None) -> list:
    """Return the unique apps linked from a developer's store page."""
    url = f"https://play.google.com/store/apps/dev?id={developer_id}&hl=en&gl=us"
    request_kwargs = {"headers": PLAY_HEADERS, "timeout": 30, "follow_redirects": True}
    if proxy_url:
        request_kwargs["proxy"] = proxy_url
    response = httpx.get(url, **request_kwargs)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    found = []
    known_packages = set()
    # Every app card links to /store/apps/details?id=<package>; dedupe by id.
    for anchor in soup.select("a[href*='/store/apps/details']"):
        match = re.search(r"id=([^&]+)", anchor.get("href", ""))
        if not match:
            continue
        package = match.group(1)
        if package in known_packages:
            continue
        known_packages.add(package)
        found.append({
            "package_name": package,
            "name": anchor.text.strip() or package,
        })
    return found
# Google LLC developer ID
apps = get_developer_apps("5700313618786177705")
print(f"Found {len(apps)} apps")
for entry in apps[:10]:
    print(f" {entry['name']} ({entry['package_name']})")
Category Browsing
# Friendly slugs mapped to the Play Store's internal category codes, as used
# in /store/apps/category/<CODE> URLs (see get_category_apps).
PLAY_CATEGORIES = {
    "games": "GAME",
    "productivity": "PRODUCTIVITY",
    "social": "SOCIAL",
    "tools": "TOOLS",
    "entertainment": "ENTERTAINMENT",
    "education": "EDUCATION",
    "shopping": "SHOPPING",
    "finance": "FINANCE",
    "health": "HEALTH_AND_FITNESS",
    "travel": "TRAVEL_AND_LOCAL",
}
def get_category_apps(category: str, collection: str = "topselling_free",
                      num_results: int = 100, proxy_url: str = None) -> list:
    """
    Get top apps from a category.
    collection: topselling_free, topselling_paid, topgrossing, movers_shakers
    Returns a list of package-name strings (at most num_results).
    """
    # Accept either a friendly slug ("games") or a raw category code ("GAME").
    cat_code = PLAY_CATEGORIES.get(category.lower(), category.upper())
    url = (f"https://play.google.com/store/apps/category/{cat_code}"
           f"/collection/{collection}?hl=en&gl=us&num={num_results}")
    request_kwargs = {"headers": PLAY_HEADERS, "timeout": 30, "follow_redirects": True}
    if proxy_url:
        request_kwargs["proxy"] = proxy_url
    response = httpx.get(url, **request_kwargs)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    packages = []
    known = set()
    # Collect unique package ids from the app-detail links, in page order.
    for anchor in soup.select("a[href*='/store/apps/details']"):
        match = re.search(r"id=([^&]+)", anchor.get("href", ""))
        if not match:
            continue
        pkg = match.group(1)
        if pkg not in known:
            known.add(pkg)
            packages.append(pkg)
    return packages[:num_results]
Anti-Bot Protections and Proxy Usage
Google has multiple layers of bot detection on the Play Store:
Rate limiting. The HTML pages and batchexecute endpoint both throttle aggressively. After ~50 requests per minute from one IP, you'll start getting 429 responses or empty results.
Cookie consent and redirects. Google redirects to consent pages in some regions. Always set gl=us and hl=en parameters and handle redirects with follow_redirects=True.
JavaScript rendering. Some app data is loaded client-side only. The batchexecute approach above avoids this, but if you need data that only appears after JS execution, you'll need a headless browser.
IP reputation. Google maintains blocklists of datacenter IP ranges. Cloud server IPs get challenged or blocked quickly.
For sustained scraping, rotating residential proxies are essential. ThorData's proxy network gives you access to residential IPs that pass Google's reputation checks — the same kind of IPs real Android users browse from:
# ThorData proxy setup for Google Play
THORDATA_USER = "your_username"
THORDATA_PASS = "your_password"

def get_proxy(country: str = "us") -> str:
    """Build a country-targeted ThorData residential proxy URL."""
    credentials = f"{THORDATA_USER}-country-{country}:{THORDATA_PASS}"
    return f"http://{credentials}@proxy.thordata.net:9000"
# US residential proxy
us_proxy = get_proxy("us")

# Route all requests through US residential IPs
client = httpx.Client(
    follow_redirects=True,
    timeout=30,
    proxy=us_proxy,
    headers=PLAY_HEADERS,
)
response = client.get(
    "https://play.google.com/store/apps/details?id=com.spotify.music&hl=en&gl=us"
)
Exporting Data
import csv
import sqlite3
from datetime import datetime
def save_reviews_csv(reviews: list, filename: str):
    """Write review dicts to a CSV file.

    No-ops on an empty list. Keys outside the fixed column set are dropped
    (DictWriter extrasaction="ignore"); missing keys become empty cells.

    Args:
        reviews: list of review dicts as produced by get_reviews().
        filename: destination CSV path.
    """
    if not reviews:
        return
    fieldnames = ["review_id", "author", "rating", "text", "thumbs_up",
                  "timestamp", "app_version", "developer_reply"]
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(reviews)
    # Report the actual destination (the message previously omitted the path).
    print(f"Saved {len(reviews)} reviews to {filename}")
def setup_play_db(db_path="play_store.db"):
    """Open the SQLite database (creating it if needed) and ensure both tables exist."""
    conn = sqlite3.connect(db_path)
    # DDL is idempotent via IF NOT EXISTS, so this is safe to call repeatedly.
    schema = (
        """
        CREATE TABLE IF NOT EXISTS apps (
            package_name TEXT PRIMARY KEY,
            name TEXT, developer TEXT, rating REAL,
            rating_count INTEGER, installs TEXT, price TEXT,
            category TEXT, content_rating TEXT,
            version TEXT, updated TEXT, scraped_at TEXT
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS reviews (
            review_id TEXT,
            package_name TEXT,
            author TEXT, rating INTEGER, text TEXT,
            thumbs_up INTEGER, timestamp INTEGER,
            app_version TEXT, developer_reply TEXT,
            scraped_at TEXT,
            PRIMARY KEY (review_id, package_name)
        )
        """,
    )
    for ddl in schema:
        conn.execute(ddl)
    conn.commit()
    return conn
def save_app_to_db(conn, app_data: dict):
    """Upsert one app row and insert its reviews, then commit.

    The "reviews" list is popped so it isn't passed as a SQL parameter, and
    restored on app_data before returning.

    Args:
        conn: an open sqlite3 connection with the apps/reviews tables present.
        app_data: dict from scrape_app_full(); missing keys become NULLs.
    """
    # Timezone-aware timestamp; datetime.utcnow() is deprecated since 3.12.
    from datetime import timezone
    now = datetime.now(timezone.utc).isoformat()
    reviews = app_data.pop("reviews", [])
    conn.execute("""
        INSERT OR REPLACE INTO apps
        (package_name, name, developer, rating, rating_count, installs,
         price, category, content_rating, version, updated, scraped_at)
        VALUES (?,?,?,?,?,?,?,?,?,?,?,?)
    """, (
        app_data.get("package_name"), app_data.get("name"),
        app_data.get("developer"), app_data.get("rating"),
        app_data.get("rating_count"), app_data.get("installs"),
        app_data.get("price"), app_data.get("category"),
        app_data.get("content_rating"), app_data.get("version"),
        app_data.get("updated"), now
    ))
    # executemany batches all review inserts in a single call.
    conn.executemany("""
        INSERT OR IGNORE INTO reviews
        (review_id, package_name, author, rating, text, thumbs_up,
         timestamp, app_version, developer_reply, scraped_at)
        VALUES (?,?,?,?,?,?,?,?,?,?)
    """, [
        (r.get("review_id"), app_data.get("package_name"),
         r.get("author"), r.get("rating"), r.get("text"),
         r.get("thumbs_up"), r.get("timestamp"),
         r.get("app_version"), r.get("developer_reply"), now)
        for r in reviews
    ])
    conn.commit()
    app_data["reviews"] = reviews  # put it back for the caller
Complete Pipeline Example
def main():
    """Scrape a fixed set of music-streaming apps and persist them to SQLite."""
    # Target apps for competitive analysis
    competitors = [
        "com.spotify.music",
        "com.apple.android.music",
        "com.amazon.mp3",
        "com.pandora.android",
        "com.soundcloud.android",
    ]
    proxy_url = get_proxy("us")  # US residential IP
    conn = setup_play_db()
    for pkg in competitors:
        print(f"\nScraping {pkg}...")
        try:
            scraped = scrape_app_full(pkg, max_review_pages=2, proxy_url=proxy_url)
            if "error" not in scraped:
                save_app_to_db(conn, scraped)
                print(f" {scraped['name']}: {scraped['rating']}/5, "
                      f"{scraped['review_count_scraped']} reviews")
        except Exception as exc:
            print(f" Failed: {exc}")
        # Long randomized pause between apps to stay under rate limits.
        time.sleep(random.uniform(3, 8))
    conn.close()
    print("\nDone!")

if __name__ == "__main__":
    main()
Practical Tips
- Use the structured data first. The ld+json block on app pages gives you clean data without parsing HTML.
- The batchexecute format changes. Google updates the RPC IDs (UsvDTd) occasionally. If reviews stop working, capture a fresh request from the browser DevTools Network tab.
- Set locale parameters. Always include hl=en&gl=us to get consistent English results and avoid consent redirects.
- Prefer httpx over requests. httpx supports HTTP/2, which looks more like a real browser.
- Use residential proxies for scale. Google blocks datacenter ranges. ThorData provides residential IPs that work reliably.
- Don't parallelize aggressively. 3 concurrent workers max per IP; spread requests with random delays.
Google Play data powers competitive analysis, app market research, and review monitoring — there's real business value in this data and Python's tooling makes it accessible even though Google doesn't offer a public API.