How to Scrape Flickr Photo Data with Python (2026)
Flickr hosts over 10 billion photos, many with detailed EXIF metadata, geolocation tags, Creative Commons licensing, and rich user-generated annotations. For anyone building image datasets, studying photography trends, collecting geo-tagged visual data, or archiving creative work, Flickr is hard to beat.
The Flickr API is mature, well-documented, and generous with access. A free API key gives you access to nearly everything — photo search, user galleries, group pools, EXIF data, tag clouds, and per-photo metadata. This guide covers effective use of the API with Python, including the limitations you'll actually hit at scale and how to work around them.
What the Flickr API Exposes
The API surface is extensive. Core data available:
- Photo metadata — title, description, tags, date taken, date uploaded, license type, safety level, views, comments, faves
- EXIF data — camera make/model, aperture, shutter speed, ISO, focal length, lens model, GPS coordinates (when present in EXIF)
- User profiles — display name, real name (if public), location, join date, total photos, follower/following counts
- Groups — title, description, member count, topic tags, pool photos with contributor info
- Galleries — curated collections by any user, with ordering and descriptions
- Commons — Flickr Commons subset of museum/archive content with special licensing
- Geo-tagged content — photos with geographic coordinates, searchable by bounding box or radial distance
- Tag statistics — hot tags, related tags, cluster views
- Machine tags — structured namespace:predicate=value tags used by power users and applications
API Key Setup
Get a free API key at flickr.com/services/apps/create. Select "non-commercial" for personal/research use. You'll receive an API key and secret immediately — no approval queue.
The API supports both JSON and XML responses. JSON with nojsoncallback=1 is the clean path.
import httpx
import time
import random
from typing import Optional
# Credentials and endpoint for the Flickr REST API.
API_KEY = "your_flickr_api_key"  # obtained at flickr.com/services/apps/create
BASE_URL = "https://api.flickr.com/services/rest/"
# Simple rate limit tracking
_last_request_time = 0  # Unix time of the most recent API call (module-global)
_min_interval = 1.0 # 1 second between requests = 3600/hr, well under limit
def flickr_request(
    method: str,
    params: Optional[dict] = None,
    timeout: float = 30.0,
) -> dict:
    """Make a rate-limited GET request to the Flickr REST API.

    Args:
        method: Flickr API method name, e.g. "flickr.photos.search".
        params: Extra query parameters merged over the base parameters.
        timeout: Per-request timeout in seconds.

    Returns:
        The parsed JSON response (a dict with stat == "ok").

    Raises:
        FlickrAPIError: When Flickr reports stat == "fail".
        httpx.HTTPStatusError: On non-2xx HTTP responses.
    """
    global _last_request_time
    # Enforce the minimum interval between calls.
    elapsed = time.time() - _last_request_time
    if elapsed < _min_interval:
        time.sleep(_min_interval - elapsed)
    # Record the attempt time *before* issuing the request so that failed
    # calls still count against the pacing window. The previous version
    # updated this only after a successful response, so a burst of failing
    # requests (timeouts, HTTP errors) was not rate-limited at all.
    _last_request_time = time.time()
    request_params = {
        "method": method,
        "api_key": API_KEY,
        "format": "json",
        "nojsoncallback": 1,
    }
    if params:
        request_params.update(params)
    response = httpx.get(BASE_URL, params=request_params, timeout=timeout)
    response.raise_for_status()
    data = response.json()
    # Flickr wraps application errors in an HTTP 200 response with stat: "fail"
    if data.get("stat") == "fail":
        code = data.get("code")
        message = data.get("message", "Unknown Flickr error")
        raise FlickrAPIError(code, message)
    return data
class FlickrAPIError(Exception):
    """Raised when the Flickr API answers with stat == "fail"."""

    def __init__(self, code: int, message: str):
        super().__init__(f"Flickr API error {code}: {message}")
        # Keep the structured fields so callers can branch on error codes.
        self.code = code
        self.message = message
Searching Photos with Full Extras
The flickr.photos.search method is extremely flexible — filter by text, tags, location, date, license, camera, user, and more:
# License IDs: 1=CC BY-NC-SA, 2=CC BY-NC, 3=CC BY-NC-ND, 4=CC BY,
# 5=CC BY-SA, 6=CC BY-ND, 9=CC0, 10=PDM
CREATIVE_COMMONS_LICENSES = "1,2,3,4,5,6,9,10"
# Comma-separated "extras" requested with every search so each result row
# carries image URLs, dates, owner, tags, geo, and engagement counts.
STANDARD_EXTRAS = (
    "url_m,url_l,url_o,date_taken,date_upload,owner_name,"
    "tags,machine_tags,geo,views,media,original_format,"
    "last_update,license,count_faves,count_comments"
)
def _photo_from_search(p: dict) -> dict:
    """Normalize one raw photo record from a flickr.photos.search response."""
    def _coord(key: str) -> Optional[float]:
        # Flickr reports "0" for photos without geo data; treat it as absent.
        val = p.get(key)
        return float(val) if val and val != "0" else None
    return {
        "id": p["id"],
        "title": p.get("title"),
        "owner_id": p["owner"],
        "owner_name": p.get("ownername"),
        "date_taken": p.get("datetaken"),
        "date_uploaded": p.get("dateupload"),
        "last_update": p.get("lastupdate"),
        # "tags" arrives as a single space-separated string.
        "tags": p.get("tags", "").split() if p.get("tags") else [],
        "machine_tags": p.get("machine_tags", ""),
        "views": int(p.get("views", 0)),
        "faves": int(p.get("count_faves", 0)),
        "comments": int(p.get("count_comments", 0)),
        "media_type": p.get("media"),
        "latitude": _coord("latitude"),
        "longitude": _coord("longitude"),
        "license_id": p.get("license"),
        "url_medium": p.get("url_m"),
        "url_large": p.get("url_l"),
        "url_original": p.get("url_o"),
        "original_format": p.get("originalformat"),
    }

def search_photos(
    text: Optional[str] = None,
    tags: Optional[str] = None,
    user_id: Optional[str] = None,
    license_ids: Optional[str] = None,
    has_geo: bool = False,
    min_taken_date: Optional[str] = None,  # YYYY-MM-DD or Unix timestamp
    max_taken_date: Optional[str] = None,
    sort: str = "relevance",
    per_page: int = 100,
    page: int = 1,
    extras: Optional[str] = None,
    safe_search: int = 1,
) -> dict:
    """
    Search Flickr photos with comprehensive filtering.

    sort options: date-posted-asc, date-posted-desc, date-taken-asc,
    date-taken-desc, interestingness-desc, relevance

    Returns:
        {"photos": [normalized dicts], "total": int, "pages": int,
         "page": int, "per_page": int}
    """
    params = {
        "per_page": min(per_page, 500),  # Flickr's hard per-page maximum
        "page": page,
        "sort": sort,
        "safe_search": safe_search,
        "extras": extras or STANDARD_EXTRAS,
    }
    if text:
        params["text"] = text
    if tags:
        params["tags"] = tags
        params["tag_mode"] = "all"  # require every listed tag, not any
    if user_id:
        params["user_id"] = user_id
    if license_ids:
        params["license"] = license_ids
    if has_geo:
        params["has_geo"] = 1
    if min_taken_date:
        params["min_taken_date"] = min_taken_date
    if max_taken_date:
        params["max_taken_date"] = max_taken_date
    data = flickr_request("flickr.photos.search", params)
    photos_data = data["photos"]
    return {
        "photos": [_photo_from_search(p) for p in photos_data["photo"]],
        "total": int(photos_data["total"]),
        "pages": int(photos_data["pages"]),
        "page": int(photos_data["page"]),
        "per_page": int(photos_data["perpage"]),
    }
def search_all_pages(text: str, max_photos: int = 4000, **kwargs) -> list[dict]:
    """
    Collect all results for a search query, respecting the 4000-result API cap.

    The Flickr search index stops returning new photos after roughly 4,000
    results per query regardless of page size, so collection stops once that
    many have been gathered. (The previous version hard-coded "page 40",
    which only matched the cap when per_page was exactly 100.)
    For broader collection, split into date range windows.
    """
    RESULT_CAP = 4000  # Flickr's per-query search ceiling
    all_photos = []
    page = 1
    while len(all_photos) < max_photos:
        result = search_photos(text=text, page=page, **kwargs)
        batch = result["photos"]
        if not batch:
            break
        all_photos.extend(batch)
        print(f" Page {page}/{result['pages']}: {len(all_photos)}/{result['total']} photos")
        # Stop at the last reported page or at the absolute result cap.
        if page >= result["pages"] or len(all_photos) >= RESULT_CAP:
            break
        page += 1
        time.sleep(random.uniform(0.8, 1.5))  # jittered pacing between pages
    return all_photos[:max_photos]
Bypassing the 4000-Result Limit with Date Windows
The search API caps results at 4000 (page 40 × 100 per page) regardless of how many total matches exist. For tags or subjects with millions of photos, split queries by date range:
from datetime import datetime, timedelta
def search_with_date_windows(
    text: str,
    start_date: str,
    end_date: str,
    window_days: int = 30,
    **kwargs,
) -> list[dict]:
    """
    Search a large date range by slicing it into fixed-size windows.

    Each window issues its own query, so no single query hits the
    4000-result cap. Dates are "YYYY-MM-DD" strings; extra keyword args
    are forwarded to search_all_pages.
    """
    fmt = "%Y-%m-%d"
    collected: list[dict] = []
    cursor = datetime.strptime(start_date, fmt)
    final = datetime.strptime(end_date, fmt)
    while cursor < final:
        # Clamp the last window to the requested end date.
        upper = min(cursor + timedelta(days=window_days), final)
        print(f" Window: {cursor.strftime(fmt)} to {upper.strftime(fmt)}")
        window_photos = search_all_pages(
            text=text,
            min_taken_date=cursor.strftime(fmt),
            max_taken_date=upper.strftime(fmt),
            **kwargs,
        )
        collected.extend(window_photos)
        print(f" Got {len(window_photos)} photos (total: {len(collected)})")
        # Advance one day past the window's end so windows don't overlap.
        cursor = upper + timedelta(days=1)
        time.sleep(2.0)  # Pause between windows
    return collected
Extracting EXIF Data
Flickr stores camera settings for millions of photos. This is valuable for photography analysis, camera market research, or training ML models where camera metadata is needed:
def get_photo_exif(photo_id: str) -> dict:
    """Get EXIF metadata for a specific photo.

    Returns {} when the photo does not exist (error code 1) or its owner
    hides EXIF data (error code 2). Any other API error propagates —
    notably code 100 (invalid API key), which the previous version
    silently treated as "no EXIF" and would therefore have masked a
    misconfigured key across an entire crawl.
    """
    try:
        data = flickr_request("flickr.photos.getExif", {"photo_id": photo_id})
    except FlickrAPIError as e:
        if e.code in (1, 2):  # 1 = photo not found, 2 = owner hides EXIF
            return {}
        raise
    if data.get("stat") != "ok":
        return {}
    photo_meta = data["photo"]
    exif_raw = photo_meta.get("exif", [])
    # Build a label -> value map, preferring the human-readable "clean"
    # value and falling back to the raw EXIF value.
    exif_map = {}
    for entry in exif_raw:
        tag = entry["label"]
        raw_val = entry.get("raw", {}).get("_content", "")
        clean_val = entry.get("clean", {}).get("_content", raw_val)
        exif_map[tag] = clean_val
    return {
        "camera": photo_meta.get("camera"),
        "make": exif_map.get("Make"),
        "model": exif_map.get("Model"),
        # Label naming varies by camera vendor; try the common variants.
        "lens": exif_map.get("Lens Model") or exif_map.get("Lens") or exif_map.get("LensModel"),
        "focal_length": exif_map.get("Focal Length"),
        "focal_length_equiv": exif_map.get("Focal Length 35mm Equivalent"),
        "aperture": exif_map.get("Aperture"),
        "shutter_speed": exif_map.get("Exposure") or exif_map.get("Shutter Speed"),
        "iso": exif_map.get("ISO Speed") or exif_map.get("ISO Speed Ratings"),
        "flash": exif_map.get("Flash"),
        "white_balance": exif_map.get("White Balance"),
        "exposure_mode": exif_map.get("Exposure Mode"),
        "metering_mode": exif_map.get("Metering Mode"),
        "software": exif_map.get("Software"),
        "gps_latitude": exif_map.get("GPS Latitude"),
        "gps_longitude": exif_map.get("GPS Longitude"),
        "gps_altitude": exif_map.get("GPS Altitude"),
        "color_space": exif_map.get("Color Space"),
    }
def get_photo_info(photo_id: str, secret: str = None) -> dict:
    """Get full metadata for a single photo via flickr.photos.getInfo."""
    query = {"photo_id": photo_id}
    if secret:
        query["secret"] = secret
    photo = flickr_request("flickr.photos.getInfo", query)["photo"]
    owner = photo["owner"]
    dates = photo["dates"]
    tag_entries = photo["tags"]["tag"]
    return {
        "id": photo["id"],
        "secret": photo["secret"],
        "server": photo["server"],
        "farm": photo["farm"],
        # Nested text values use Flickr's "_content" wrapper convention.
        "title": photo["title"]["_content"],
        "description": photo["description"]["_content"],
        "owner_id": owner["nsid"],
        "owner_name": owner["username"],
        "owner_realname": owner.get("realname"),
        "date_taken": dates["taken"],
        "date_posted": dates["posted"],
        "date_updated": dates.get("lastupdate"),
        "views": int(photo.get("views", 0)),
        "tags": [t["raw"] for t in tag_entries],
        "machine_tags": [t["raw"] for t in tag_entries if t.get("machine_tag") == "1"],
        "license_id": photo["license"],
        "safety_level": photo.get("safety_level"),
        "url": photo["urls"]["url"][0]["_content"],
        "media": photo.get("media"),
        "original_format": photo.get("originalformat"),
    }
Geo-Tagged Photo Collection
Flickr's geo search API allows bounding box queries — useful for building location-tagged image datasets:
def search_by_location(
    lat: float,
    lon: float,
    radius_km: float = 5.0,
    min_taken_date: Optional[str] = None,
    license_ids: Optional[str] = None,
    per_page: int = 100,
    page: int = 1,
) -> dict:
    """Search photos near a geographic point (radial geo search).

    Args:
        lat, lon: Center of the search.
        radius_km: Search radius in kilometers.
        min_taken_date: Optional lower bound ("YYYY-MM-DD" or timestamp).
        license_ids: Optional comma-separated license ID filter.
        per_page: Results per page, now clamped to Flickr's 500 maximum
            for consistency with search_photos (previously unclamped).
        page: 1-based page number.

    Returns:
        {"photos": [...], "total": int, "pages": int}
    """
    params = {
        "lat": lat,
        "lon": lon,
        "radius": radius_km,
        "radius_units": "km",
        "has_geo": 1,
        "per_page": min(per_page, 500),
        "page": page,
        "extras": "geo,url_m,url_l,date_taken,tags,views,license",
        "safe_search": 1,
    }
    if min_taken_date:
        params["min_taken_date"] = min_taken_date
    if license_ids:
        params["license"] = license_ids
    data = flickr_request("flickr.photos.search", params)
    photos_data = data["photos"]
    return {
        "photos": [
            {
                "id": p["id"],
                "title": p.get("title"),
                "owner_id": p["owner"],
                "lat": float(p.get("latitude", 0) or 0),
                "lon": float(p.get("longitude", 0) or 0),
                "date_taken": p.get("datetaken"),
                "tags": p.get("tags", "").split(),
                "views": int(p.get("views", 0)),
                "license_id": p.get("license"),
                "url_medium": p.get("url_m"),
                "url_large": p.get("url_l"),
            }
            for p in photos_data.get("photo", [])
        ],
        "total": int(photos_data.get("total", 0)),
        "pages": int(photos_data.get("pages", 0)),
    }
def search_by_bbox(
    min_lat: float, min_lon: float,
    max_lat: float, max_lon: float,
    per_page: int = 100,
    **kwargs,
) -> list[dict]:
    """
    Search photos within a bounding box.

    Flickr expects the bbox string as "min_lon,min_lat,max_lon,max_lat"
    (longitude first). Extra keyword args are forwarded verbatim as API
    parameters and may override the defaults below.
    """
    query = {
        "bbox": f"{min_lon},{min_lat},{max_lon},{max_lat}",
        "has_geo": 1,
        "per_page": per_page,
        "extras": "geo,url_m,url_l,date_taken,views,license",
    }
    query.update(kwargs)
    response = flickr_request("flickr.photos.search", query)
    return response["photos"].get("photo", [])
Group Pools
Groups on Flickr are curated collections, often with specific photographic themes (wildlife, street photography, specific camera models). Scraping group pools gives you topical image sets that have already passed a curator's review.
def get_group_info(group_id: str = None, group_path_alias: str = None) -> dict:
    """Get metadata for a Flickr group.

    Args:
        group_id: The group's NSID.
        group_path_alias: The group's URL path alias (alternative lookup).

    Returns:
        Dict with id, name, description, and member/pool/topic counts.

    Raises:
        ValueError: If neither identifier is supplied. (Previously an
            empty request was sent, which failed server-side with an
            opaque API error.)
    """
    if not group_id and not group_path_alias:
        raise ValueError("get_group_info requires group_id or group_path_alias")
    params = {}
    if group_id:
        params["group_id"] = group_id
    else:
        params["group_path_alias"] = group_path_alias
    data = flickr_request("flickr.groups.getInfo", params)
    group = data["group"]
    return {
        "id": group["id"],
        "name": group["name"]["_content"],
        "description": group.get("description", {}).get("_content", ""),
        "members": int(group.get("members", {}).get("_content", 0)),
        "pool_count": int(group.get("pool_count", {}).get("_content", 0)),
        "topic_count": int(group.get("topic_count", {}).get("_content", 0)),
    }
def get_group_photos(
    group_id: str,
    per_page: int = 100,
    page: int = 1,
    extras: str = None,
) -> dict:
    """Fetch one page of photos from a Flickr group pool."""
    query = {
        "group_id": group_id,
        "per_page": per_page,
        "page": page,
        "extras": extras or "url_m,url_l,date_taken,owner_name,tags,views,license",
    }
    payload = flickr_request("flickr.groups.pools.getPhotos", query)["photos"]

    def slim(p: dict) -> dict:
        # Keep only the fields downstream code relies on.
        return {
            "id": p["id"],
            "title": p.get("title"),
            "owner_id": p["owner"],
            "owner_name": p.get("ownername"),
            "views": int(p.get("views", 0)),
            "license_id": p.get("license"),
            "url_medium": p.get("url_m"),
            "url_large": p.get("url_l"),
        }

    return {
        "photos": [slim(p) for p in payload.get("photo", [])],
        "total": int(payload.get("total", 0)),
        "pages": int(payload.get("pages", 0)),
    }
def scrape_group_all_photos(
    group_id: str,
    max_pages: int = 40,
) -> list[dict]:
    """Walk a group pool page by page, collecting up to 4000 photos."""
    collected: list[dict] = []
    page = 1
    while page <= max_pages:
        result = get_group_photos(group_id, page=page)
        batch = result["photos"]
        if not batch:
            break
        collected.extend(batch)
        print(f" Group pool page {page}/{result['pages']}: {len(collected)} total")
        # No further pages reported — stop early.
        if page >= result["pages"]:
            break
        time.sleep(random.uniform(0.8, 1.5))
        page += 1
    return collected
User Gallery Scraping
def get_user_photos(
    user_id: str,
    per_page: int = 100,
    page: int = 1,
    min_taken_date: str = None,
    extras: str = None,
) -> dict:
    """Fetch one page of a user's public photostream."""
    query = {
        "user_id": user_id,
        "per_page": per_page,
        "page": page,
        "extras": extras or "url_m,url_l,date_taken,views,tags,license,original_format",
    }
    if min_taken_date:
        query["min_taken_date"] = min_taken_date
    payload = flickr_request("flickr.people.getPublicPhotos", query)["photos"]

    def slim(p: dict) -> dict:
        return {
            "id": p["id"],
            "title": p.get("title"),
            "date_taken": p.get("datetaken"),
            "views": int(p.get("views", 0)),
            "license_id": p.get("license"),
            "url_medium": p.get("url_m"),
            "url_large": p.get("url_l"),
            "original_format": p.get("originalformat"),
        }

    return {
        "photos": [slim(p) for p in payload.get("photo", [])],
        "total": int(payload.get("total", 0)),
        "pages": int(payload.get("pages", 0)),
    }
def find_user_by_email(email: str) -> dict:
    """Resolve an email address to a Flickr user record (requires auth)."""
    response = flickr_request("flickr.people.findByEmail", {"find_email": email})
    return response.get("user", {})
def get_user_profile(user_id: str) -> dict:
    """Fetch public profile fields for a Flickr user (flickr.people.getInfo)."""
    person = flickr_request("flickr.people.getInfo", {"user_id": user_id})["person"]

    def text_field(name: str) -> str:
        # Optional text values use Flickr's {"_content": ...} wrapper.
        return person.get(name, {}).get("_content", "")

    photos_block = person.get("photos", {})
    return {
        "id": person["id"],
        "nsid": person["nsid"],
        "username": person["username"]["_content"],
        "realname": text_field("realname"),
        "location": text_field("location"),
        "description": text_field("description"),
        "photos_count": int(photos_block.get("count", {}).get("_content", 0)),
        # NOTE(review): "firstdate" is the first photo's upload date, which
        # may differ from the account's actual join date — confirm if exact
        # join dates matter downstream.
        "joined": photos_block.get("firstdate", {}).get("_content"),
        "profile_url": person["profileurl"]["_content"],
        "is_pro": person.get("ispro", 0) == 1,
    }
Anti-Bot Measures and Rate Limits
Flickr is API-friendly, but large-scale collection hits specific walls:
Hourly API rate limit. Flickr allows 3,600 API calls per hour per key — exactly 1 per second average. Burst above that and requests return error code 18 (Rate limit exceeded). The _last_request_time tracking in the base client handles this automatically.
Search result cap. The photos.search endpoint returns at most 4,000 results per query (page 40 × 100). Use date-window splitting as shown above for high-volume subjects.
EXIF availability. Photo owners can disable EXIF visibility, and phone uploads often strip EXIF before upload. Expect full camera EXIF on roughly 40-60% of DSLR shots, much less for smartphone photos.
CDN throttling on image downloads. While API calls are rate-limited per key, downloading the actual image files (the url_l, url_o URLs) is throttled per IP by Flickr's static CDN. Fetching images rapidly from one IP triggers 429s and temporary soft blocks.
Multiple API keys in parallel. For higher throughput than 1 req/sec, the Flickr API terms technically prohibit using multiple keys from the same application — but many researchers use separate keys for separate research tasks. Each key gets its own 3,600 req/hour bucket.
For bulk image downloads at scale — building datasets with hundreds of thousands of images — you need to distribute the download requests across multiple IPs. ThorData's rotating residential proxies are effective for this because Flickr's CDN applies standard browser-like rate limits to residential IPs rather than the aggressive throttling it applies to recognized datacenter ranges:
import httpx
import random
# ThorData residential proxy credentials and gateway endpoint, used only
# for CDN image downloads (API calls go direct).
THORDATA_USER = "your_user"
THORDATA_PASS = "your_pass"
THORDATA_HOST = "proxy.thordata.com"
THORDATA_PORT = 9000
def make_download_client(session_id: str = None) -> httpx.Client:
    """Build an httpx client that routes traffic through the ThorData proxy.

    Passing a session_id pins the proxy to a sticky session (suffixing the
    proxy username), so the same worker keeps a stable exit IP; without one
    the gateway rotates IPs per request.

    NOTE(review): newer httpx releases replaced the ``proxies=`` keyword
    with ``proxy=``/``mounts=`` — confirm against the pinned httpx version.
    """
    proxy_user = f"{THORDATA_USER}-session-{session_id}" if session_id else THORDATA_USER
    endpoint = f"http://{proxy_user}:{THORDATA_PASS}@{THORDATA_HOST}:{THORDATA_PORT}"
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Referer": "https://www.flickr.com/",
    }
    return httpx.Client(
        proxies={"https://": endpoint, "http://": endpoint},
        timeout=30.0,
        follow_redirects=True,
        headers=browser_headers,
    )
def download_photos_batch(
    photos: list[dict],
    output_dir: str = "flickr_downloads",
    prefer_size: str = "large",
    max_workers: int = 4,
) -> dict:
    """
    Download a batch of photos with parallel workers, each using a different proxy session.
    Returns {photo_id: local_path} mapping.

    Photos with no usable URL, or whose download fails, map to None.
    Files already present in output_dir are skipped, so reruns resume.
    """
    import os
    from concurrent.futures import ThreadPoolExecutor, as_completed
    os.makedirs(output_dir, exist_ok=True)
    results = {}
    def download_one(photo: dict, worker_id: int) -> tuple:
        # Returns (photo_id, local_path_or_None).
        photo_id = photo["id"]
        # Prefer the requested size; fall back to whatever URL is present.
        url = photo.get(f"url_{prefer_size}") or photo.get("url_medium") or photo.get("url_large")
        if not url:
            return photo_id, None
        # NOTE(review): the extension comes from original_format even when a
        # resized URL is fetched — confirm the resized variants match that
        # format, or the saved extension may be wrong.
        ext = photo.get("original_format", "jpg")
        filename = f"{photo_id}.{ext}"
        filepath = os.path.join(output_dir, filename)
        if os.path.exists(filepath):
            return photo_id, filepath # Already downloaded
        # One sticky proxy session per worker slot so each slot keeps a
        # stable exit IP across its downloads.
        client = make_download_client(session_id=f"worker-{worker_id}")
        try:
            resp = client.get(url)
            if resp.status_code == 200:
                with open(filepath, "wb") as f:
                    f.write(resp.content)
                return photo_id, filepath
        except Exception as e:
            print(f" Failed {photo_id}: {e}")
        finally:
            client.close()
        return photo_id, None
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(download_one, photo, i % max_workers): photo["id"]
            for i, photo in enumerate(photos)
        }
        for future in as_completed(futures):
            photo_id, path = future.result()
            results[photo_id] = path
            # NOTE(review): this sleep paces result *collection*, not the
            # downloads themselves, which run concurrently in the pool.
            time.sleep(random.uniform(0.2, 0.5))
    success = sum(1 for p in results.values() if p)
    print(f"Downloaded {success}/{len(photos)} photos to {output_dir}")
    return results
Storing Photo Metadata
For building persistent datasets:
import sqlite3
import json
def init_flickr_db(path: str = "flickr_photos.db") -> sqlite3.Connection:
    """Create the photos/photo_exif tables and indexes, returning an open
    connection.

    Bug fix: sqlite3's Connection.execute() accepts exactly one SQL
    statement per call, so the original single execute() containing three
    semicolon-separated CREATE INDEX statements raised
    sqlite3.ProgrammingError. Each index now gets its own execute().
    """
    conn = sqlite3.connect(path)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS photos (
            id TEXT PRIMARY KEY,
            owner_id TEXT,
            owner_name TEXT,
            title TEXT,
            description TEXT,
            date_taken TEXT,
            date_uploaded TEXT,
            tags TEXT,
            views INTEGER,
            faves INTEGER,
            license_id TEXT,
            latitude REAL,
            longitude REAL,
            url_medium TEXT,
            url_large TEXT,
            url_original TEXT,
            original_format TEXT,
            local_path TEXT,
            collected_at TEXT DEFAULT (datetime('now'))
        )
    """)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS photo_exif (
            photo_id TEXT PRIMARY KEY,
            camera TEXT,
            make TEXT,
            model TEXT,
            lens TEXT,
            focal_length TEXT,
            aperture TEXT,
            shutter_speed TEXT,
            iso TEXT,
            flash TEXT,
            gps_latitude TEXT,
            gps_longitude TEXT,
            collected_at TEXT DEFAULT (datetime('now')),
            FOREIGN KEY (photo_id) REFERENCES photos(id)
        )
    """)
    # One statement per execute() call — see docstring.
    for ddl in (
        "CREATE INDEX IF NOT EXISTS idx_photos_owner ON photos(owner_id)",
        "CREATE INDEX IF NOT EXISTS idx_photos_date ON photos(date_taken)",
        "CREATE INDEX IF NOT EXISTS idx_photos_license ON photos(license_id)",
    ):
        conn.execute(ddl)
    conn.commit()
    return conn
def save_photos(conn: sqlite3.Connection, photos: list[dict]):
    """Bulk-insert photo metadata rows.

    Existing IDs are left untouched (INSERT OR IGNORE), so re-running a
    crawl over overlapping results is idempotent. Tags are stored as a
    JSON-encoded list.
    """
    def as_row(p: dict) -> tuple:
        return (
            p["id"], p.get("owner_id"), p.get("owner_name"), p.get("title"),
            p.get("date_taken"), p.get("date_uploaded"),
            json.dumps(p.get("tags", [])),
            p.get("views", 0), p.get("faves", 0), p.get("license_id"),
            p.get("latitude"), p.get("longitude"),
            p.get("url_medium"), p.get("url_large"), p.get("url_original"),
            p.get("original_format"),
        )

    conn.executemany("""
        INSERT OR IGNORE INTO photos
        (id, owner_id, owner_name, title, date_taken, date_uploaded,
        tags, views, faves, license_id, latitude, longitude,
        url_medium, url_large, url_original, original_format)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """, [as_row(p) for p in photos])
    conn.commit()
def save_exif(conn: sqlite3.Connection, photo_id: str, exif: dict):
    """Upsert one photo's EXIF record; an empty dict is a no-op."""
    if not exif:
        return
    field_order = (
        "camera", "make", "model", "lens", "focal_length", "aperture",
        "shutter_speed", "iso", "flash", "gps_latitude", "gps_longitude",
    )
    row = (photo_id,) + tuple(exif.get(name) for name in field_order)
    conn.execute("""
        INSERT OR REPLACE INTO photo_exif
        (photo_id, camera, make, model, lens, focal_length, aperture,
        shutter_speed, iso, flash, gps_latitude, gps_longitude)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """, row)
    conn.commit()
Practical Dataset Building
A complete workflow for building a tagged image dataset:
def build_dataset(
    subject: str,
    license_ids: str = CREATIVE_COMMONS_LICENSES,
    start_date: str = "2020-01-01",
    end_date: str = "2026-01-01",
    target_size: int = 10000,
    download_images: bool = True,
    output_dir: str = "dataset",
    has_geo: bool = False,
):
    """Build a labeled image dataset from Flickr for a given subject.

    Args:
        subject: Free-text search query.
        license_ids: Comma-separated license filter (defaults to all CC).
        start_date / end_date: "YYYY-MM-DD" bounds for date-window search.
        target_size: Maximum number of images to download.
        download_images: If False, only metadata is collected.
        output_dir: Directory for metadata.db and images/ (created if missing).
        has_geo: Restrict results to geo-tagged photos. (New parameter,
            default False, so existing callers are unaffected — it also
            makes the documented `build_dataset(..., has_geo=True)` usage
            actually work.)
    """
    import os
    # sqlite3.connect cannot create intermediate directories; without this
    # the f"{output_dir}/metadata.db" open failed on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)
    conn = init_flickr_db(f"{output_dir}/metadata.db")
    print(f"Searching Flickr for '{subject}'...")
    photos = search_with_date_windows(
        text=subject,
        start_date=start_date,
        end_date=end_date,
        license_ids=license_ids,
        has_geo=has_geo,
    )
    print(f"Found {len(photos)} photos. Saving metadata...")
    save_photos(conn, photos)
    if download_images:
        print(f"Downloading images to {output_dir}/images/...")
        download_photos_batch(
            photos[:target_size],
            output_dir=f"{output_dir}/images",
            max_workers=4,
        )
    print(f"Dataset complete: {len(photos)} metadata records, up to {target_size} images")
    conn.close()
# Example: wildlife photography dataset with GPS coordinates
# build_dataset("wildlife photography", has_geo=True, target_size=5000)
Legal and Ethical Considerations
Flickr's API terms permit non-commercial research and personal use. The key restriction is that you must attribute photographers when publishing or redistributing images (even CC-licensed ones require attribution). For published datasets:
- Always include the photo ID, owner ID, and license ID in your metadata
- Link back to the original photo URL so viewers can find the source
- For CC-BY and stricter licenses, display photographer attribution in any public-facing use
- Respect photographer requests to remove their work even when licensing technically permits inclusion
- Don't use the API to scrape user personal information for commercial outreach
The Creative Commons license filtering (license=1,2,3,4,5,6,9,10) is the most important tool for building legally usable datasets. The difference between license=4 (CC BY) and no license filter is enormous in terms of what you can do with the resulting dataset.