Scraping Foursquare Venue Data with Python (2026)
Foursquare's Places API is one of the best sources for venue data on the internet — restaurants, shops, hotels, gyms, attractions, anything with a physical address. They have 100M+ places globally with categories, ratings, user-generated photos, tips, hours of operation, and popularity by time of day.
The v3 API is well-designed, free up to 200K calls per month, and gives you structured JSON for most of the data you need. This guide covers the full workflow: API setup, searching venues, enriching details, bulk city scanning, web scraping for data outside the API, exporting to multiple formats, and real-world use cases for location data projects.
Getting Started: API Key Setup
You need a Foursquare developer account and API key. Sign up at developer.foursquare.com, create a project (free), and grab the API key.
The v3 API uses a single Authorization header — much cleaner than the old OAuth v2 client_id/client_secret pattern.
import httpx
import requests
import time
import random
import json
import csv
import sqlite3
import math
from pathlib import Path
from datetime import datetime, timezone
from itertools import product as cartesian_product
from bs4 import BeautifulSoup
# Foursquare v3 API key — replace with your own from developer.foursquare.com.
FOURSQUARE_API_KEY = "YOUR_API_KEY_HERE"

# Shared HTTP client for all Places API calls. v3 auth is a single
# Authorization header carrying the raw API key (no OAuth dance).
client = httpx.Client(
    base_url="https://api.foursquare.com/v3",
    headers={
        "Authorization": FOURSQUARE_API_KEY,
        "Accept": "application/json",
    },
    timeout=20,
)
Understanding the v3 API Structure
The Foursquare Places API v3 has three core endpoints:
| Endpoint | Purpose | Returns |
|---|---|---|
/places/search |
Search by keyword, location, category | Up to 50 results per query |
/places/{fsq_id} |
Full details for one venue | All available fields |
/places/nearby |
Venues near a lat/lng (no query) | Up to 50 results |
The fsq_id is a unique venue identifier like 4b058804f964a520c4f722e3. It is stable across requests and is your primary key for building a database.
Searching for Venues
The /places/search endpoint is your starting point. You can search by text query, location, category ID, or any combination.
# Every field name the Places API can return for a venue; passed verbatim
# as the `fields` query parameter to request the full record in one call.
ALL_FIELDS = (
    "fsq_id,name,location,categories,chains,distance,"
    "rating,stats,hours,hours_popular,price,tips,tastes,"
    "popularity,website,tel,social_media,description,"
    "features,store_id,verified,photos"
)
def search_venues(
    query: str = "",
    lat: float | None = None,
    lng: float | None = None,
    radius: int = 5000,
    categories: str = "",
    limit: int = 50,
    fields: str = ALL_FIELDS,
) -> list:
    """
    Search for venues using the Foursquare Places API.

    Args:
        query: Text search string (e.g. "coffee", "sushi restaurant")
        lat, lng: Center point of search area (both must be given for
            the "ll" parameter to be sent)
        radius: Search radius in meters (max 100000)
        categories: Comma-separated Foursquare category IDs
        limit: Results per request (capped at 50 — the API maximum)
        fields: Comma-separated field names to return

    Returns:
        List of venue dicts (empty list when nothing matched)

    Raises:
        httpx.HTTPStatusError: on any non-2xx API response
    """
    # Only send parameters the caller actually provided.
    params = {"limit": min(limit, 50)}
    if query:
        params["query"] = query
    if lat is not None and lng is not None:
        params["ll"] = f"{lat},{lng}"
    if radius:
        params["radius"] = radius
    if categories:
        params["categories"] = categories
    if fields:
        params["fields"] = fields
    r = client.get("/places/search", params=params)
    r.raise_for_status()
    return r.json().get("results", [])
# Example: coffee shops around central London (lat/lng = city center).
venues = search_venues(
    query="coffee",
    lat=51.5074,
    lng=-0.1278,
    radius=2000,
)
print(f"Found {len(venues)} coffee shops in central London")
# Print a short summary of the first five results.
for v in venues[:5]:
    name = v.get("name", "Unknown")
    rating = v.get("rating", "N/A")
    addr = v.get("location", {}).get("formatted_address", "")
    cats = [c["name"] for c in v.get("categories", [])]
    print(f" {name} - Rating: {rating}/10 - {addr} - {cats}")
Robust Request Handling with Retry Logic
For production use, wrap API calls with retry logic and rate limit handling:
def safe_api_call(endpoint, params=None, max_retries=5):
    """
    Make a Foursquare API call with retry logic and exponential backoff.

    Handles 429 (rate limit), 404 (missing resource), 5xx (server error),
    connection errors, and timeouts. Other 4xx responses raise immediately
    via raise_for_status().

    Args:
        endpoint: API path (e.g. "/places/search")
        params: Query parameters dict
        max_retries: Maximum retry attempts

    Returns:
        Response JSON dict, or None on 404 / after all retries fail
    """
    for attempt in range(max_retries):
        try:
            r = client.get(endpoint, params=params)
            if r.status_code == 429:
                # Rate limited — honor Retry-After when it is a plain
                # delta-seconds value. Per RFC 9110 the header may also be
                # an HTTP-date, which int() cannot parse; fall back to 60s.
                try:
                    retry_after = int(r.headers.get("Retry-After", 60))
                except ValueError:
                    retry_after = 60
                print(f"Rate limited. Waiting {retry_after}s (attempt {attempt+1}/{max_retries})")
                time.sleep(retry_after)
                continue
            if r.status_code == 404:
                return None
            if r.status_code >= 500:
                # Exponential backoff with jitter so parallel workers
                # do not retry in lockstep.
                wait = (2 ** attempt) + random.uniform(0, 1)
                print(f"Server error {r.status_code}. Retry in {wait:.1f}s...")
                time.sleep(wait)
                continue
            r.raise_for_status()
            return r.json()
        except httpx.ConnectError:
            wait = (2 ** attempt) + random.uniform(0, 2)
            print(f"Connection error. Retry {attempt+1}/{max_retries} in {wait:.1f}s...")
            time.sleep(wait)
        except httpx.TimeoutException:
            print(f"Timeout on attempt {attempt+1}")
            time.sleep(5)
    print(f"All {max_retries} attempts failed for {endpoint}")
    return None
Parsing Venue Data
The API returns nested JSON. Here is a comprehensive parser that flattens the structure for database storage:
def parse_venue(v: dict) -> dict:
    """
    Flatten a nested Foursquare venue object into a flat dict for storage.

    Every field in the API response is optional; missing data becomes an
    empty string / None / 0 so downstream CSV and SQLite code never breaks.
    """
    loc = v.get("location", {})
    cats = v.get("categories", [])
    tip_list = v.get("tips", [])
    chain_list = v.get("chains", [])
    stat_block = v.get("stats", {})
    social_block = v.get("social_media", {})
    feature_block = v.get("features", {})
    photo_list = v.get("photos", [])
    hours_block = v.get("hours", {})
    amenities = feature_block.get("amenities", {})

    # Icon/photo URLs arrive split as prefix + suffix; the size (or the
    # literal "original") goes in between.
    def _join_url(parts: dict, middle: str):
        pre, suf = parts.get("prefix", ""), parts.get("suffix", "")
        return f"{pre}{middle}{suf}" if pre and suf else None

    icon_url = _join_url(cats[0].get("icon", {}), "64") if cats else None
    photo_url = _join_url(photo_list[0], "original") if photo_list else None
    first_tip = tip_list[0].get("text", "") if tip_list else ""
    popular_blocks = v.get("hours_popular", [])

    return {
        # Identifiers
        "fsq_id": v.get("fsq_id", ""),
        "name": v.get("name", ""),
        "verified": v.get("verified", False),
        # Location
        "address": loc.get("formatted_address", ""),
        "address_line1": loc.get("address", ""),
        "city": loc.get("locality", ""),
        "state": loc.get("region", ""),
        "country": loc.get("country", ""),
        "postcode": loc.get("postcode", ""),
        "lat": loc.get("latitude"),
        "lng": loc.get("longitude"),
        "neighborhood": ", ".join(loc.get("neighborhood", [])),
        "cross_street": loc.get("cross_street", ""),
        # Classification
        "categories": ", ".join(c.get("name", "") for c in cats),
        "category_ids": ", ".join(str(c.get("id", "")) for c in cats),
        "main_category_icon": icon_url,
        "chain": chain_list[0].get("name", "") if chain_list else "",
        # Quality signals
        "rating": v.get("rating"),
        "price": v.get("price"),  # 1-4 scale
        "popularity": v.get("popularity"),  # float 0-1
        # Contact
        "website": v.get("website", ""),
        "phone": v.get("tel", ""),
        "twitter": social_block.get("twitter", ""),
        "facebook_id": social_block.get("facebookUrl", ""),
        # Content
        "description": v.get("description", ""),
        "top_tip": first_tip,
        "tip_count": len(tip_list),
        "first_photo_url": photo_url,
        # Stats
        "total_checkins": stat_block.get("total_checkins", 0),
        "total_tips": stat_block.get("total_tips", 0),
        "total_ratings": stat_block.get("total_ratings", 0),
        "total_photos": stat_block.get("total_photos", 0),
        # Hours
        "is_open_now": hours_block.get("open_now"),
        "hours_display": hours_block.get("display", ""),
        "popular_hours_count": len(popular_blocks),
        # Features
        "has_wifi": amenities.get("wifi"),
        "wheelchair_accessible": amenities.get("wheelchair_accessible"),
        "outdoor_seating": amenities.get("outdoor_seating"),
        "serves_food": feature_block.get("food_and_drink", {}).get("meals", {}).get("lunch"),
        # Store identifier
        "store_id": v.get("store_id", ""),
        # Metadata
        "collected_at": datetime.now(timezone.utc).isoformat(),
    }
Getting Full Venue Details
The search endpoint gives you the basics. For the complete data set — tips, photos, hours by day, popularity curves — use the detail endpoint:
def get_venue_details(fsq_id: str, fields: str = ALL_FIELDS) -> dict | None:
    """Return the full venue record for one fsq_id, or None if it no longer exists (404)."""
    resp = client.get(f"/places/{fsq_id}", params={"fields": fields})
    if resp.status_code == 404:
        return None
    resp.raise_for_status()
    return resp.json()
def enrich_venues(venues: list, delay: float = 0.5) -> list:
    """
    Replace search-level venue records with full, flattened detail records.

    Fetches /places/{fsq_id} for every venue that has an id, flattens the
    result with parse_venue(), and sleeps `delay` seconds between calls
    to stay well inside the rate limit.
    """
    detailed = []
    for idx, raw in enumerate(venues):
        venue_id = raw.get("fsq_id")
        if not venue_id:
            continue
        full_record = get_venue_details(venue_id)
        if full_record:
            detailed.append(parse_venue(full_record))
        if (idx + 1) % 10 == 0:
            print(f"Enriched {idx + 1}/{len(venues)} venues...")
        time.sleep(delay)
    return detailed
# Full workflow: broad search first, then per-venue detail enrichment.
raw_venues = search_venues("sushi", lat=35.6762, lng=139.6503, radius=3000)
print(f"Found {len(raw_venues)} sushi restaurants near Tokyo station")
detailed_venues = enrich_venues(raw_venues[:20])
print(f"Enriched {len(detailed_venues)} venues with full details")
Bulk City Scanning: Grid Strategy
The API returns a maximum of 50 results per query. To cover an entire city, split the area into a grid of small cells and query each one, deduplicating by fsq_id.
This is the key technique for building comprehensive city datasets.
def haversine_km(lat1: float, lng1: float, lat2: float, lng2: float) -> float:
    """Great-circle distance in kilometers between two lat/lng points (haversine formula)."""
    earth_radius_km = 6371.0
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    half_dlat = math.radians(lat2 - lat1) / 2
    half_dlng = math.radians(lng2 - lng1) / 2
    h = math.sin(half_dlat) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(half_dlng) ** 2
    return 2 * earth_radius_km * math.asin(math.sqrt(h))
def degrees_per_km(lat: float) -> tuple:
    """
    Return (degrees_lat_per_km, degrees_lng_per_km) at the given latitude.

    One degree of latitude is ~110.574 km everywhere; one degree of
    longitude spans 111.320 * cos(lat) km, shrinking toward the poles.
    """
    per_km_lat = 1 / 110.574
    per_km_lng = 1 / (111.320 * math.cos(math.radians(lat)))
    return per_km_lat, per_km_lng
def generate_grid(
    lat_min: float,
    lat_max: float,
    lng_min: float,
    lng_max: float,
    cell_radius_m: int = 800,
) -> list:
    """
    Generate a grid of (lat, lng) center points covering a bounding box.

    The step between centers is one cell diameter, so adjacent search
    circles touch. A smaller cell_radius_m means a denser grid: more API
    calls but fewer missed venues.
    """
    def _axis(start: float, stop: float, step: float) -> list:
        # Walk from start to stop inclusive, rounding to 6 decimal places
        # (~0.1 m precision) for stable, readable coordinates.
        values, current = [], start
        while current <= stop:
            values.append(round(current, 6))
            current += step
        return values

    mid_lat = (lat_min + lat_max) / 2
    diameter_km = (cell_radius_m * 2) / 1000
    lat_deg, lng_deg = degrees_per_km(mid_lat)
    return list(cartesian_product(
        _axis(lat_min, lat_max, lat_deg * diameter_km),
        _axis(lng_min, lng_max, lng_deg * diameter_km),
    ))
def scan_city(
    query: str,
    lat_min: float,
    lat_max: float,
    lng_min: float,
    lng_max: float,
    cell_radius_m: int = 800,
    request_delay: float = 0.4,
    max_errors: int = 10,
) -> list:
    """
    Scan a city bounding box for all venues matching a query.

    Queries every grid cell and deduplicates venues by fsq_id. A cell that
    hits the rate limit (HTTP 429) is retried after a cooldown instead of
    being skipped, so its venues are not silently lost.

    Args:
        query: Search string (e.g. "restaurant", "coffee")
        lat_min, lat_max, lng_min, lng_max: Bounding box coordinates
        cell_radius_m: Radius of each grid cell in meters
        request_delay: Seconds between API calls
        max_errors: Stop scanning if this many consecutive errors occur

    Returns:
        List of flat venue dicts ready for storage
    """
    grid = generate_grid(lat_min, lat_max, lng_min, lng_max, cell_radius_m)
    seen_ids: set = set()
    all_venues: list = []
    consecutive_errors = 0
    print(f"Scanning {len(grid)} grid cells with radius={cell_radius_m}m...")

    def _fetch_cell(lat: float, lng: float) -> list:
        # Retry this cell up to 3 times on 429 so a rate-limit pause never
        # drops the cell's data (previously the cell was skipped entirely).
        for _ in range(3):
            try:
                return search_venues(query=query, lat=lat, lng=lng, radius=cell_radius_m)
            except httpx.HTTPStatusError as e:
                if e.response.status_code != 429:
                    raise
                print(" Rate limited. Waiting 60s before retrying this cell...")
                time.sleep(60)
        raise RuntimeError("cell still rate-limited after 3 retries")

    for i, (lat, lng) in enumerate(grid):
        try:
            results = _fetch_cell(lat, lng)
            new_venues = []
            for v in results:
                fid = v.get("fsq_id")
                if fid and fid not in seen_ids:
                    seen_ids.add(fid)
                    new_venues.append(parse_venue(v))
            all_venues.extend(new_venues)
            consecutive_errors = 0  # a clean cell resets the error streak
            if new_venues:
                print(f" Cell {i+1}/{len(grid)} ({lat:.4f},{lng:.4f}): +{len(new_venues)} new (total: {len(all_venues)})")
        except httpx.HTTPStatusError as e:
            consecutive_errors += 1
            print(f" HTTP {e.response.status_code} at cell {i+1}")
        except Exception as ex:
            consecutive_errors += 1
            print(f" Error at cell {i+1}: {ex}")
        if consecutive_errors >= max_errors:
            print(f"Too many consecutive errors ({max_errors}). Stopping scan.")
            break
        time.sleep(request_delay)
    print(f"Scan complete: {len(all_venues)} unique venues found")
    return all_venues
# Example: scan for restaurants in Manhattan.
# Expect roughly one API call per grid cell, so budget call volume
# accordingly before running on a large bounding box.
manhattan_restaurants = scan_city(
    query="restaurant",
    lat_min=40.700,
    lat_max=40.780,
    lng_min=-74.020,
    lng_max=-73.940,
    cell_radius_m=700,
)
print(f"Manhattan restaurants: {len(manhattan_restaurants)}")
Category-Based Scanning
For comprehensive data by venue type, use Foursquare's category IDs instead of text queries. This avoids keyword matching issues:
# Key Foursquare category IDs (as of 2026).
# Values are passed verbatim to the `categories` search parameter.
CATEGORIES = {
    "coffee_shops": "13032",
    "restaurants": "13065",
    "fast_food": "13145",
    "bars": "13003",
    "nightclubs": "10032",
    "hotels": "19014",
    "gyms": "18021",
    "supermarkets": "17069",
    "hospitals": "15014",
    "schools": "12058",
    "museums": "10027",
    "parks": "16032",
    "shopping_malls": "17114",
    "banks": "11010",
    "cinemas": "10024",
    "bookstores": "17022",
}
def scan_city_by_category(
    category_id: str,
    lat_min: float,
    lat_max: float,
    lng_min: float,
    lng_max: float,
    cell_radius_m: int = 800,
    request_delay: float = 0.4,
) -> list:
    """
    Scan a city bounding box for all venues in a specific category.

    Like scan_city(), but filters by category ID instead of a text query,
    which avoids keyword-matching misses. Deduplicates by fsq_id.

    Args:
        category_id: Foursquare category ID (see CATEGORIES)
        lat_min, lat_max, lng_min, lng_max: Bounding box coordinates
        cell_radius_m: Radius of each grid cell in meters
        request_delay: Seconds between API calls

    Returns:
        List of flat venue dicts ready for storage
    """
    grid = generate_grid(lat_min, lat_max, lng_min, lng_max, cell_radius_m)
    seen_ids: set = set()
    all_venues: list = []
    print(f"Scanning {len(grid)} cells for category {category_id}...")
    for i, (lat, lng) in enumerate(grid):
        try:
            results = search_venues(
                lat=lat,
                lng=lng,
                radius=cell_radius_m,
                categories=category_id,
                limit=50,
            )
            for v in results:
                fid = v.get("fsq_id")
                if fid and fid not in seen_ids:
                    seen_ids.add(fid)
                    all_venues.append(parse_venue(v))
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 429:
                print(f"Rate limited at cell {i+1}. Waiting 60s...")
                time.sleep(60)
            else:
                print(f"HTTP {e.response.status_code} at cell {i+1}")
        except Exception as ex:
            # Connection errors / timeouts previously aborted the whole
            # scan; log and move on to the next cell instead.
            print(f"Error at cell {i+1}: {ex}")
        time.sleep(request_delay)
    return all_venues
# Example: all coffee shops in a central-London bounding box.
coffee_shops = scan_city_by_category(
    CATEGORIES["coffee_shops"],
    lat_min=51.48, lat_max=51.54,
    lng_min=-0.18, lng_max=-0.02,
)
print(f"Coffee shops in central London: {len(coffee_shops)}")
Rate Limits and Free Tier Details
Foursquare's free tier is generous for most use cases:
| Limit | Free Plan |
|---|---|
| API calls per month | 200,000 |
| Requests per minute | ~100 |
| Results per search query | 50 max |
| Results per nearby query | 50 max |
For a grid scan of a medium-sized city (say 500 grid cells), you will use roughly 500-1000 API calls — well within the free tier for a one-off collection.
Monitoring Your Usage
def check_api_quota() -> dict:
    """
    Read the rate-limit headers Foursquare attaches to every response.

    Note: this spends one minimal API call just to obtain fresh headers.
    """
    probe = client.get(
        "/places/search",
        params={"query": "test", "ll": "40.7,-74.0", "limit": 1}
    )
    hdrs = dict(probe.headers)
    return {
        "rate_limit_remaining": hdrs.get("x-ratelimit-remaining"),
        "rate_limit_reset": hdrs.get("x-ratelimit-reset"),
        "rate_limit_limit": hdrs.get("x-ratelimit-limit"),
    }
# Example: print the current rate-limit headers.
quota = check_api_quota()
print(f"API quota: {quota}")
Web Scraping to Supplement API Data
The Foursquare API does not expose everything. The website has: - Full review text (not just tips) - Photo galleries per venue - Historical popularity trends - User check-in counts visible on venue pages - "Best nearby" editorial content
For this supplementary data, you scrape the venue web pages. Since rate limits are per API key (not per IP) for the API, you only need proxies for web scraping.
ThorData's rotating residential proxies are ideal here — their residential IP pool ensures each request comes from a legitimate household IP, which avoids the IP-based blocks Foursquare deploys on the website (separate from the API).
# ThorData proxy configuration
# Sign up at: https://thordata.partnerstack.com/partner/0a0x4nzh
THORDATA_USER = "YOUR_USERNAME"
THORDATA_PASS = "YOUR_PASSWORD"
# Rotating-gateway endpoint; credentials are embedded in the proxy URL.
THORDATA_PROXY = f"http://{THORDATA_USER}:{THORDATA_PASS}@gate.thordata.net:7777"

# Rotating browser User-Agent headers to avoid fingerprinting
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0",
]

# Dedicated client for scraping foursquare.com pages (separate from the
# API `client` above, which never needs a proxy).
# NOTE(review): newer httpx releases replaced the `proxies=` argument with
# `proxy=` / `mounts=` — confirm the pinned httpx version supports this.
scrape_client = httpx.Client(
    proxies={
        "http://": THORDATA_PROXY,
        "https://": THORDATA_PROXY,
    },
    headers={
        "User-Agent": random.choice(USER_AGENTS),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "DNT": "1",
    },
    timeout=30,
    follow_redirects=True,
)
def scrape_venue_web_page(fsq_id: str) -> dict:
    """
    Scrape web-only venue data from a venue's public Foursquare page.

    Extracts the JSON-LD aggregate rating, visible tip/review snippets,
    and the photo-count badge. Returns an empty dict on any fetch failure.
    """
    url = f"https://foursquare.com/v/{fsq_id}"
    # Rotate user agent on each request
    scrape_client.headers["User-Agent"] = random.choice(USER_AGENTS)
    try:
        resp = scrape_client.get(url)
        resp.raise_for_status()
    except httpx.HTTPStatusError as e:
        print(f"HTTP {e.response.status_code} for {fsq_id}")
        return {}
    except Exception as e:
        print(f"Scrape error for {fsq_id}: {e}")
        return {}

    page = BeautifulSoup(resp.text, "html.parser")
    data: dict = {}

    # Structured data (JSON-LD) embedded in the page head
    ld_script = page.find("script", type="application/ld+json")
    if ld_script:
        try:
            ld = json.loads(ld_script.string)
            data["aggregate_rating"] = ld.get("aggregateRating", {})
            data["review_count"] = ld.get("aggregateRating", {}).get("reviewCount")
        except Exception:
            pass

    # Visible tips / review snippets (several candidate selectors)
    snippets = []
    for node in page.select(".tip, .review-text, [data-testid=tip-text]")[:10]:
        text = node.get_text(strip=True)
        if len(text) > 20:
            snippets.append(text)
    data["scraped_tips"] = snippets

    # Photo-count badge, when the page renders one
    badge = page.select_one(".photo-count, [data-testid=photo-count]")
    if badge:
        data["web_photo_count"] = badge.get_text(strip=True)
    return data
def enrich_with_web_data(venues: list, delay: float = 2.0) -> list:
    """Merge scraped web-page data into each venue dict (scraped keys get a web_ prefix)."""
    for idx, record in enumerate(venues):
        venue_id = record.get("fsq_id")
        if not venue_id:
            continue
        extra = scrape_venue_web_page(venue_id)
        if extra:
            record.update({f"web_{key}": value for key, value in extra.items()})
        if (idx + 1) % 10 == 0:
            print(f"Web-enriched {idx + 1}/{len(venues)} venues")
        # Randomize delay to mimic human behavior
        time.sleep(random.uniform(delay * 0.7, delay * 1.3))
    return venues
Sticky Sessions for Multi-Page Scraping
When scraping multiple pages for the same venue (photos, reviews), use ThorData's sticky session feature to keep the same IP for the duration:
import uuid
def build_thordata_sticky_client(
    session_minutes: int = 10,
    country: str = "US",
) -> httpx.Client:
    """
    Build an httpx client routed through a ThorData sticky-session proxy.

    Encoding a random session id into the proxy username pins one
    residential IP for the session window, so multi-page crawls of a
    single venue do not hop IPs between requests (which can trigger
    bot detection).

    Sign up: https://thordata.partnerstack.com/partner/0a0x4nzh

    NOTE(review): session_minutes is not encoded into the proxy username
    here — confirm ThorData's syntax for the session TTL.
    """
    sid = str(uuid.uuid4())[:8]
    username = f"{THORDATA_USER}-session-{sid}-country-{country}"
    endpoint = f"http://{username}:{THORDATA_PASS}@gate.thordata.net:7777"
    return httpx.Client(
        proxies={"http://": endpoint, "https://": endpoint},
        headers={
            "User-Agent": random.choice(USER_AGENTS),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
        },
        timeout=30,
        follow_redirects=True,
    )
# Example: one sticky session fetches several pages of the same venues,
# so all requests share a single exit IP.
with build_thordata_sticky_client(country="GB") as sticky_client:
    venue_ids = ["4b058804f964a520c4f722e3", "4b058804f964a520c4f722e4"]
    for fsq_id in venue_ids:
        # Main venue page
        r = sticky_client.get(f"https://foursquare.com/v/{fsq_id}")
        print(f"Main page: {r.status_code}")
        time.sleep(1.5)
        # Photos page
        r2 = sticky_client.get(f"https://foursquare.com/v/{fsq_id}/photos")
        print(f"Photos: {r2.status_code}")
        time.sleep(random.uniform(1, 3))
SQLite Database for Venue Storage
Persist your venue collection in SQLite for queries and deduplication:
def init_venues_db(path: str = "venues.db") -> sqlite3.Connection:
    """
    Open (or create) the venue database and ensure its schema exists.

    Creates the `venues` table (keyed on fsq_id), its query indexes, and
    the `collection_log` audit table. Returns a connection whose
    row_factory is sqlite3.Row so results support dict-style access.
    """
    db = sqlite3.connect(path)
    db.row_factory = sqlite3.Row
    db.executescript("""
        CREATE TABLE IF NOT EXISTS venues (
            fsq_id TEXT PRIMARY KEY,
            name TEXT NOT NULL,
            address TEXT,
            city TEXT,
            state TEXT,
            country TEXT,
            postcode TEXT,
            lat REAL,
            lng REAL,
            neighborhood TEXT,
            categories TEXT,
            category_ids TEXT,
            main_category_icon TEXT,
            chain TEXT,
            rating REAL,
            price INTEGER,
            popularity REAL,
            website TEXT,
            phone TEXT,
            twitter TEXT,
            description TEXT,
            top_tip TEXT,
            tip_count INTEGER DEFAULT 0,
            first_photo_url TEXT,
            total_checkins INTEGER DEFAULT 0,
            total_tips INTEGER DEFAULT 0,
            total_ratings INTEGER DEFAULT 0,
            total_photos INTEGER DEFAULT 0,
            is_open_now INTEGER,
            hours_display TEXT,
            has_wifi INTEGER,
            wheelchair_accessible INTEGER,
            outdoor_seating INTEGER,
            verified INTEGER DEFAULT 0,
            store_id TEXT,
            collected_at TEXT
        );
        CREATE INDEX IF NOT EXISTS idx_venues_city ON venues(city);
        CREATE INDEX IF NOT EXISTS idx_venues_category ON venues(categories);
        CREATE INDEX IF NOT EXISTS idx_venues_rating ON venues(rating DESC);
        CREATE INDEX IF NOT EXISTS idx_venues_lat_lng ON venues(lat, lng);
        CREATE TABLE IF NOT EXISTS collection_log (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            query TEXT,
            city TEXT,
            lat_min REAL, lat_max REAL, lng_min REAL, lng_max REAL,
            venues_found INTEGER,
            started_at TEXT,
            completed_at TEXT
        );
    """)
    db.commit()
    return db
def upsert_venue(conn: sqlite3.Connection, venue: dict):
    """Insert a venue row, or update every non-key column when fsq_id already exists."""
    cols = list(venue.keys())
    marks = ", ".join("?" for _ in cols)
    # `excluded.<col>` refers to the row that failed to insert.
    assignments = ", ".join(f"{c} = excluded.{c}" for c in cols if c != "fsq_id")
    conn.execute(
        f"""
        INSERT INTO venues ({", ".join(cols)})
        VALUES ({marks})
        ON CONFLICT(fsq_id) DO UPDATE SET {assignments}
        """,
        list(venue.values()),
    )
def bulk_upsert_venues(conn: sqlite3.Connection, venues: list):
    """Upsert every venue, commit once at the end, and report the count."""
    for record in venues:
        upsert_venue(conn, record)
    conn.commit()
    print(f"Saved {len(venues)} venues to database")
def query_venues_nearby(conn: sqlite3.Connection, lat: float, lng: float, radius_km: float = 1.0):
    """
    Return rated venues inside an approximate radius, best-rated first.

    Uses a lat/lng bounding box (1 degree of latitude ~= 111 km) instead
    of true great-circle distance — fast on a plain (lat, lng) index and
    accurate enough at city scale.
    """
    dlat = radius_km / 111.0
    dlng = radius_km / (111.0 * math.cos(math.radians(lat)))
    cursor = conn.execute("""
        SELECT * FROM venues
        WHERE lat BETWEEN ? AND ?
          AND lng BETWEEN ? AND ?
          AND rating IS NOT NULL
        ORDER BY rating DESC
    """, (lat - dlat, lat + dlat, lng - dlng, lng + dlng))
    return [dict(row) for row in cursor.fetchall()]
Exporting Data
def export_to_csv(venues: list, output_path: str = "venues.csv"):
    """Write the venue dicts to a CSV file; columns come from the first venue's keys."""
    if not venues:
        print("No venues to export")
        return
    target = Path(output_path)
    columns = list(venues[0].keys())
    with open(target, "w", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=columns)
        writer.writeheader()
        writer.writerows(venues)
    size_kb = target.stat().st_size / 1024
    print(f"Exported {len(venues)} venues to {target} ({size_kb:.1f} KB)")
def export_to_geojson(venues: list, output_path: str = "venues.geojson"):
    """
    Export venues as a GeoJSON FeatureCollection for mapping tools.
    Compatible with QGIS, Mapbox, Leaflet, Kepler.gl, etc.

    Venues without coordinates are skipped. Coordinates of exactly 0.0
    are valid (equator / prime meridian), so presence is tested with
    `is not None` rather than truthiness (the old check dropped them).
    """
    features = []
    for v in venues:
        if v.get("lat") is not None and v.get("lng") is not None:
            features.append({
                "type": "Feature",
                "geometry": {
                    "type": "Point",
                    # GeoJSON coordinate order is [longitude, latitude].
                    "coordinates": [v["lng"], v["lat"]],
                },
                "properties": {
                    k: val for k, val in v.items()
                    if k not in ("lat", "lng")
                },
            })
    geojson = {
        "type": "FeatureCollection",
        "features": features,
        "metadata": {
            "count": len(features),
            "exported_at": datetime.now(timezone.utc).isoformat(),
        }
    }
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(geojson, f, indent=2)
    print(f"Exported {len(features)} venues to {output_path} (GeoJSON)")
def export_to_jsonl(venues: list, output_path: str = "venues.jsonl"):
    """Write venues as JSON Lines — one JSON object per line, append- and stream-friendly."""
    with open(output_path, "w", encoding="utf-8") as sink:
        sink.writelines(json.dumps(record) + "\n" for record in venues)
    print(f"Exported {len(venues)} venues to {output_path}")
Real-World Use Cases
Use Case 1: Restaurant Recommendation Engine
Build a ranked list of restaurants in any area for a recommendation feature:
def build_restaurant_rankings(
    lat: float,
    lng: float,
    radius_m: int = 2000,
    category: str = "13065",
) -> list:
    """
    Build a ranked restaurant list for a location.

    Ranking is a weighted composite: rating 50%, popularity 30%, tip
    count 10%, check-in volume 10%. Missing signals fall back to neutral
    defaults so unrated venues are ranked rather than discarded.

    Returns:
        List of flat venue dicts sorted by composite score, best first
    """
    found = search_venues(lat=lat, lng=lng, radius=radius_m, categories=category)
    flattened = [parse_venue(item) for item in found]

    def composite(entry):
        rating = entry.get("rating") or 5.0          # 0-10 scale, neutral default
        popularity = entry.get("popularity") or 0.3  # 0-1 scale, modest default
        tip_signal = min(entry.get("tip_count", 0), 50) / 50        # capped at 50 tips
        checkin_signal = min(entry.get("total_checkins", 0), 10000) / 10000
        return (
            (rating / 10) * 0.5
            + popularity * 0.3
            + tip_signal * 0.1
            + checkin_signal * 0.1
        )

    ranked = sorted(flattened, key=composite, reverse=True)
    print(f"Top restaurants near ({lat:.4f}, {lng:.4f}):")
    for r in ranked[:10]:
        print(f" {r['name']:30s} | Rating: {r['rating'] or 'N/A'}/10 | {r['address'][:40]}")
    return ranked
# Example: best restaurants within 1.5 km of Times Square, NYC.
nyc_restaurants = build_restaurant_rankings(lat=40.7580, lng=-73.9855, radius_m=1500)
Use Case 2: Market Research — Coffee Shop Density Map
Map all coffee shops in a city to identify oversaturated vs underserved areas:
def coffee_density_analysis(
    city_name: str,
    lat_min: float, lat_max: float,
    lng_min: float, lng_max: float,
    cell_size_km: float = 1.0,
) -> dict:
    """
    Analyze coffee shop density across a city grid.

    Collects every coffee shop in the bounding box, snaps each to a
    coarse lat/lng cell, and counts venues per cell to surface
    high-density (saturated) and low-density (opportunity) zones.

    Args:
        city_name: Label used in the printed report and result dict
        lat_min, lat_max, lng_min, lng_max: Bounding box coordinates
        cell_size_km: Scan cell size; the API search radius becomes
            cell_size_km * 500 meters

    Returns:
        Dict with density stats, hotspot cells, and top-rated venues
        (empty dict when no venues were found)
    """
    # Collect all coffee shops
    venues = scan_city_by_category(
        CATEGORIES["coffee_shops"],
        lat_min, lat_max, lng_min, lng_max,
        cell_radius_m=int(cell_size_km * 500),
    )
    if not venues:
        return {}
    # Build density grid by snapping each venue to a coarse cell.
    grid_size = 0.02  # degrees; ~2km grid cells in latitude
    density_map = {}
    for v in venues:
        if not v.get("lat") or not v.get("lng"):
            continue
        # Snap to grid
        cell_lat = round(v["lat"] / grid_size) * grid_size
        cell_lng = round(v["lng"] / grid_size) * grid_size
        cell = (round(cell_lat, 4), round(cell_lng, 4))
        density_map[cell] = density_map.get(cell, 0) + 1
    # Identify hotspots and gaps (cells sorted by venue count, descending)
    sorted_cells = sorted(density_map.items(), key=lambda x: -x[1])
    stats = {
        "city": city_name,
        "total_venues": len(venues),
        "grid_cells_with_venues": len(density_map),
        "max_density": sorted_cells[0][1] if sorted_cells else 0,
        "avg_density": sum(density_map.values()) / len(density_map) if density_map else 0,
        "hotspots": sorted_cells[:5],
        "top_rated": sorted(
            [v for v in venues if v.get("rating")],
            key=lambda x: -(x["rating"] or 0)
        )[:10],
    }
    print(f"\n=== Coffee density in {city_name} ===")
    print(f"Total coffee shops: {stats['total_venues']}")
    print(f"Avg per grid cell: {stats['avg_density']:.1f}")
    print(f"\nHotspot areas (lat, lng): count")
    for cell, count in stats["hotspots"]:
        print(f" {cell}: {count} shops")
    return stats
# Example: density analysis for a central-London bounding box.
london_coffee = coffee_density_analysis(
    "London",
    lat_min=51.47, lat_max=51.54,
    lng_min=-0.18, lng_max=-0.02,
)
Use Case 3: Travel App — Top-Rated Attractions by City
Fetch top-rated venues across categories to power a travel recommendations feature:
# Category IDs surfaced by the travel-recommendations use case.
# See CATEGORIES above for the broader ID reference.
TRAVEL_CATEGORIES = {
    "museums": "10027",
    "parks": "16032",
    "attractions": "10000",
    "restaurants": "13065",
    "coffee": "13032",
    "nightlife": "10032",
    "shopping": "17000",
    "hotels": "19014",
}
def get_travel_recommendations(
    lat: float,
    lng: float,
    radius_m: int = 5000,
    min_rating: float = 7.5,
) -> dict:
    """
    Build a travel recommendations dict for a location.

    For each entry in TRAVEL_CATEGORIES, fetches up to 20 venues, keeps
    those rated at least min_rating, and returns the 5 best per category.

    Args:
        lat, lng: City center coordinates
        radius_m: Search radius in meters
        min_rating: Minimum rating to include (0-10 scale)

    Returns:
        Dict mapping category name to list of top venues
    """
    picks: dict = {}
    for cat_name, cat_id in TRAVEL_CATEGORIES.items():
        print(f"Fetching {cat_name}...")
        found = search_venues(
            lat=lat,
            lng=lng,
            radius=radius_m,
            categories=cat_id,
            limit=20,
        )
        flattened = [parse_venue(item) for item in found]
        # Keep rated venues above the threshold, best first.
        qualified = sorted(
            (item for item in flattened if item.get("rating") and item["rating"] >= min_rating),
            key=lambda item: -(item["rating"] or 0),
        )
        picks[cat_name] = qualified[:5]
        print(f" Found {len(qualified)} {cat_name} with rating >= {min_rating}")
        time.sleep(0.5)
    return picks
# Example: travel recommendations for Amsterdam city center.
amsterdam_recs = get_travel_recommendations(
    lat=52.3676,
    lng=4.9041,
    radius_m=3000,
    min_rating=8.0,
)
# Pretty-print the grouped results.
for category, venues in amsterdam_recs.items():
    print(f"\nTop {category} in Amsterdam:")
    for v in venues:
        print(f" {v['name']:35s} | {v['rating']}/10 | {v['address'][:50]}")
Use Case 4: Location Intelligence for Retail Site Selection
Analyze competitor density and foot traffic signals to help choose retail locations:
def retail_site_analysis(
    candidate_lat: float,
    candidate_lng: float,
    competitor_category_id: str,
    complement_category_ids: list,
    radius_m: int = 500,
) -> dict:
    """
    Analyze a candidate retail location using surrounding venue data.

    Looks at:
    - Competitor count (same category) in radius
    - Complementary venue count (foot traffic generators)
    - Average competitor ratings (quality of competition)

    Args:
        candidate_lat, candidate_lng: Location to analyze
        competitor_category_id: Category ID of your business type
        complement_category_ids: Categories that generate foot traffic (e.g. offices, transit)
        radius_m: Analysis radius in meters

    Returns:
        Analysis dict with a 0-10 site score and a
        "strong"/"moderate"/"weak" recommendation
    """
    # Count competitors of the same category around the candidate point.
    competitors = search_venues(
        lat=candidate_lat,
        lng=candidate_lng,
        radius=radius_m,
        categories=competitor_category_id,
    )
    comp_parsed = [parse_venue(v) for v in competitors]
    # Mean rating over rated competitors only; max(1, ...) avoids a
    # ZeroDivisionError when none are rated.
    # NOTE(review): with zero rated competitors this evaluates to 0.0,
    # which then scores as "no quality opportunity" — confirm intended.
    avg_comp_rating = (
        sum(v["rating"] for v in comp_parsed if v.get("rating")) /
        max(1, sum(1 for v in comp_parsed if v.get("rating")))
    )
    # Count foot traffic generators (one API call per complementary category).
    foot_traffic_total = 0
    for cat_id in complement_category_ids:
        ft_venues = search_venues(
            lat=candidate_lat,
            lng=candidate_lng,
            radius=radius_m,
            categories=cat_id,
        )
        foot_traffic_total += len(ft_venues)
        time.sleep(0.3)
    # Score the site: fewer competitors and more foot traffic score higher;
    # highly rated competition (avg > 5) counts as a quality opportunity.
    competition_score = max(0, 10 - len(competitors))  # fewer = better
    foot_traffic_score = min(10, foot_traffic_total)
    quality_opportunity = max(0, avg_comp_rating - 5) if comp_parsed else 5
    site_score = (
        competition_score * 0.4 +
        foot_traffic_score * 0.4 +
        quality_opportunity * 0.2
    )
    analysis = {
        "location": (candidate_lat, candidate_lng),
        "competitor_count": len(competitors),
        "avg_competitor_rating": round(avg_comp_rating, 1),
        "foot_traffic_venue_count": foot_traffic_total,
        "site_score": round(site_score, 1),
        "recommendation": (
            "strong" if site_score >= 7 else
            "moderate" if site_score >= 5 else
            "weak"
        ),
        "top_competitors": [
            {"name": v["name"], "rating": v["rating"], "address": v["address"]}
            for v in sorted(comp_parsed, key=lambda x: -(x["rating"] or 0))[:5]
        ],
    }
    return analysis
# Example: score a potential coffee shop location near King's Cross.
site = retail_site_analysis(
    candidate_lat=51.5194,
    candidate_lng=-0.1270,
    competitor_category_id=CATEGORIES["coffee_shops"],
    complement_category_ids=["17114", "11000"],  # shopping malls, offices
    radius_m=400,
)
print(f"\nSite analysis: {site['location']}")
print(f"Competitors within 400m: {site['competitor_count']}")
print(f"Avg competitor rating: {site['avg_competitor_rating']}/10")
print(f"Foot traffic venues: {site['foot_traffic_venue_count']}")
print(f"Site score: {site['site_score']}/10 ({site['recommendation']})")
Full Collection Pipeline
Here is a complete end-to-end pipeline that scans a city, stores results in SQLite, and exports to multiple formats:
def run_city_collection(
    city_name: str,
    queries: list,
    bounding_box: tuple,
    output_db: str = "venues.db",
    cell_radius_m: int = 700,
):
    """
    Full city data collection pipeline: grid-scan each query over the
    bounding box, upsert results into SQLite, export CSV + GeoJSON, and
    record the run in collection_log.

    Args:
        city_name: Name for logging and export filenames
        queries: List of search queries or category IDs to collect
        bounding_box: (lat_min, lat_max, lng_min, lng_max) tuple
        output_db: SQLite database path
        cell_radius_m: Grid cell radius in meters passed to scan_city

    Returns:
        Dict mapping each query to the number of venues it produced.
        NOTE: the same venue can match several queries; the DB upsert
        deduplicates rows, but per-query counts (and their sum) may
        double-count across queries.
    """
    lat_min, lat_max, lng_min, lng_max = bounding_box
    conn = init_venues_db(output_db)
    # Ensure the SQLite connection is released even if a scan, export, or
    # insert raises part-way through (the original leaked it on error).
    try:
        started_at = datetime.now(timezone.utc).isoformat()
        total_venues = 0
        stats = {}
        print(f"\n=== Collecting venues for {city_name} ===")
        print(f"Bounding box: ({lat_min}, {lat_max}, {lng_min}, {lng_max})")
        print(f"Cell radius: {cell_radius_m}m")
        for query in queries:
            print(f"\nQuery: {query}")
            venues = scan_city(
                query=query,
                lat_min=lat_min, lat_max=lat_max,
                lng_min=lng_min, lng_max=lng_max,
                cell_radius_m=cell_radius_m,
            )
            bulk_upsert_venues(conn, venues)
            stats[query] = len(venues)
            total_venues += len(venues)
        # Export everything stored for this city. Rows with NULL city are
        # included too — presumably some API results omit the locality
        # field; confirm against the upsert logic.
        all_venues = [
            dict(row) for row in conn.execute(
                "SELECT * FROM venues WHERE city = ? OR city IS NULL ORDER BY rating DESC",
                (city_name,)
            ).fetchall()
        ]
        if all_venues:
            slug = city_name.lower().replace(" ", "_")
            export_to_csv(all_venues, f"{slug}_venues.csv")
            export_to_geojson(all_venues, f"{slug}_venues.geojson")
        completed_at = datetime.now(timezone.utc).isoformat()
        # Log the collection run so later runs can audit coverage.
        conn.execute("""
        INSERT INTO collection_log (query, city, lat_min, lat_max, lng_min, lng_max, venues_found, started_at, completed_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, (
            ", ".join(queries), city_name,
            lat_min, lat_max, lng_min, lng_max,
            total_venues, started_at, completed_at,
        ))
        conn.commit()
        print(f"\n=== Collection complete: {city_name} ===")
        # total_venues sums per-query counts, so it is NOT a unique-venue
        # count (cross-query duplicates counted once per matching query).
        print(f"Total venues collected: {total_venues}")
        print(f"By query: {stats}")
        return stats
    finally:
        conn.close()
# Collect venues for Berlin
berlin_queries = ["restaurant", "coffee", "bar", "hotel", "museum"]
berlin_bbox = (52.45, 52.56, 13.30, 13.50)
run_city_collection(
    city_name="Berlin",
    queries=berlin_queries,
    bounding_box=berlin_bbox,
    output_db="berlin_venues.db",
    cell_radius_m=700,
)
Summary
The Foursquare Places API v3 is the best freely available source for structured, global venue data. The key techniques covered here:
- Grid scanning is mandatory for comprehensive city coverage — the 50-result cap per query means you must tile the area
- Category IDs give cleaner results than text queries for systematic collection
- Rate limits are generous on the free tier (200K/month) but you still need retry logic and respectful delays
- Web scraping supplements API data for content not exposed via API (tips, photos, historical popularity)
- Residential proxies (such as ThorData's) are strongly recommended for web scraping to avoid IP blocks on the Foursquare website
- SQLite with a proper schema handles deduplication and makes the dataset queryable without spinning up a full database server
- GeoJSON export makes the data immediately usable in mapping and visualization tools
For a one-time city dataset, the free tier is more than sufficient. For continuous or multi-city collection, budget for the paid API plan (~$200/month) or structure your collection to spread API usage across the monthly window.