← Back to blog

How to Scrape Flickr Photo Data with Python (2026)

How to Scrape Flickr Photo Data with Python (2026)

Flickr hosts over 10 billion photos, many with detailed EXIF metadata, geolocation tags, Creative Commons licensing, and rich user-generated annotations. For anyone building image datasets, studying photography trends, collecting geo-tagged visual data, or archiving creative work, Flickr is hard to beat.

The Flickr API is mature, well-documented, and generous with access. A free API key gives you access to nearly everything — photo search, user galleries, group pools, EXIF data, tag clouds, and per-photo metadata. This guide covers effective use of the API with Python, including the limitations you'll actually hit at scale and how to work around them.

What the Flickr API Exposes

The API surface is extensive. Core data available includes photo search results, user photostreams and galleries, group pools, per-photo metadata and EXIF, tags and machine tags, geolocation, and license information.

API Key Setup

Get a free API key at flickr.com/services/apps/create. Select "non-commercial" for personal/research use. You'll receive an API key and secret immediately — no approval queue.

The API supports both JSON and XML responses. JSON with nojsoncallback=1 is the clean path.

import httpx
import time
import random
from typing import Optional

# Flickr REST API credentials and endpoint.
# NOTE: placeholder value — keep real keys in environment/config, not source.
API_KEY = "your_flickr_api_key"
BASE_URL = "https://api.flickr.com/services/rest/"

# Simple rate limit tracking: module-level state shared by flickr_request().
_last_request_time: float = 0  # epoch seconds of the most recent request
_min_interval: float = 1.0  # 1 second between requests ≈ the 3,600/hr API cap


def flickr_request(
    method: str,
    params: Optional[dict] = None,
    timeout: float = 30.0,
) -> dict:
    """Make a rate-limited GET request to the Flickr REST API.

    Args:
        method: Flickr API method name, e.g. "flickr.photos.search".
        params: Extra query parameters merged over the defaults.
        timeout: Per-request timeout in seconds.

    Returns:
        Parsed JSON response body.

    Raises:
        FlickrAPIError: When Flickr reports ``stat == "fail"`` (the API
            signals errors inside an HTTP 200 body).
        httpx.HTTPStatusError: For non-2xx HTTP responses.
    """
    global _last_request_time

    # Enforce minimum spacing between request starts.
    elapsed = time.time() - _last_request_time
    if elapsed < _min_interval:
        time.sleep(_min_interval - elapsed)

    # Record the start time *before* issuing the request. The original
    # updated the timestamp only after a successful response, so a string
    # of timeouts/HTTP errors was not throttled at all and could burst
    # past the rate limit.
    _last_request_time = time.time()

    request_params = {
        "method": method,
        "api_key": API_KEY,
        "format": "json",
        "nojsoncallback": 1,  # plain JSON, no JSONP wrapper
    }
    if params:
        request_params.update(params)

    response = httpx.get(BASE_URL, params=request_params, timeout=timeout)
    response.raise_for_status()

    data = response.json()

    # Flickr wraps errors in a 200 response with stat: "fail"
    if data.get("stat") == "fail":
        code = data.get("code")
        message = data.get("message", "Unknown Flickr error")
        raise FlickrAPIError(code, message)

    return data


class FlickrAPIError(Exception):
    """Raised when a Flickr API call returns stat == "fail".

    Carries the numeric error ``code`` and ``message`` reported by Flickr.
    """

    def __init__(self, code: int, message: str):
        super().__init__(f"Flickr API error {code}: {message}")
        self.code = code
        self.message = message

Searching Photos with Full Extras

The flickr.photos.search method is extremely flexible — filter by text, tags, location, date, license, camera, user, and more:

# License IDs: 1=CC BY-NC-SA, 2=CC BY-NC, 3=CC BY-NC-ND, 4=CC BY,
#              5=CC BY-SA, 6=CC BY-ND, 9=CC0, 10=PDM
# Comma-separated filter covering every Creative Commons / public-domain
# license ID, for the "license" search parameter.
CREATIVE_COMMONS_LICENSES = "1,2,3,4,5,6,9,10"

# Default "extras" requested from search calls so each result row carries
# image URLs, dates, owner, tags, geo, and engagement counts without
# needing a follow-up per-photo API call.
STANDARD_EXTRAS = (
    "url_m,url_l,url_o,date_taken,date_upload,owner_name,"
    "tags,machine_tags,geo,views,media,original_format,"
    "last_update,license,count_faves,count_comments"
)


def _parse_coord(raw) -> Optional[float]:
    """Convert a geo extra to float, mapping Flickr's 0 / "0" "no geo"
    sentinel to None (same semantics as the original inline checks)."""
    if not raw or raw == "0":
        return None
    return float(raw)


def search_photos(
    text: Optional[str] = None,
    tags: Optional[str] = None,
    user_id: Optional[str] = None,
    license_ids: Optional[str] = None,
    has_geo: bool = False,
    min_taken_date: Optional[str] = None,  # YYYY-MM-DD or Unix timestamp
    max_taken_date: Optional[str] = None,
    sort: str = "relevance",
    per_page: int = 100,
    page: int = 1,
    extras: Optional[str] = None,
    safe_search: int = 1,
    tag_mode: str = "all",
) -> dict:
    """
    Search Flickr photos with comprehensive filtering.

    sort options: date-posted-asc, date-posted-desc, date-taken-asc,
                  date-taken-desc, interestingness-desc, relevance
    tag_mode: "all" (AND semantics, the previous hard-coded behavior) or
              "any" (OR semantics). Only applied when tags are given.

    Returns a dict with normalized "photos" records plus "total", "pages",
    "page", and "per_page" pagination fields.
    """
    params = {
        "per_page": min(per_page, 500),  # API maximum is 500 per page
        "page": page,
        "sort": sort,
        "safe_search": safe_search,
        "extras": extras or STANDARD_EXTRAS,
    }

    if text:
        params["text"] = text
    if tags:
        params["tags"] = tags
        params["tag_mode"] = tag_mode
    if user_id:
        params["user_id"] = user_id
    if license_ids:
        params["license"] = license_ids
    if has_geo:
        params["has_geo"] = 1
    if min_taken_date:
        params["min_taken_date"] = min_taken_date
    if max_taken_date:
        params["max_taken_date"] = max_taken_date

    data = flickr_request("flickr.photos.search", params)
    photos_data = data["photos"]

    photos = []
    for p in photos_data["photo"]:
        photos.append({
            "id": p["id"],
            "title": p.get("title"),
            "owner_id": p["owner"],
            "owner_name": p.get("ownername"),
            "date_taken": p.get("datetaken"),
            "date_uploaded": p.get("dateupload"),
            "last_update": p.get("lastupdate"),
            # "tags" arrives as a single space-separated string.
            "tags": p.get("tags", "").split() if p.get("tags") else [],
            "machine_tags": p.get("machine_tags", ""),
            "views": int(p.get("views", 0)),
            "faves": int(p.get("count_faves", 0)),
            "comments": int(p.get("count_comments", 0)),
            "media_type": p.get("media"),
            "latitude": _parse_coord(p.get("latitude")),
            "longitude": _parse_coord(p.get("longitude")),
            "license_id": p.get("license"),
            "url_medium": p.get("url_m"),
            "url_large": p.get("url_l"),
            "url_original": p.get("url_o"),
            "original_format": p.get("originalformat"),
        })

    return {
        "photos": photos,
        "total": int(photos_data["total"]),
        "pages": int(photos_data["pages"]),
        "page": int(photos_data["page"]),
        "per_page": int(photos_data["perpage"]),
    }


def search_all_pages(text: str, max_photos: int = 4000, **kwargs) -> list[dict]:
    """
    Collect every result page for a search query.

    Stops at the Flickr search cap of 4000 results (page 40 at 100/page);
    for broader collection, split the query into date-range windows.
    """
    collected: list[dict] = []
    page_num = 1

    while len(collected) < max_photos:
        result = search_photos(text=text, page=page_num, **kwargs)
        batch = result["photos"]
        if not batch:
            break

        collected += batch
        print(f"  Page {page_num}/{result['pages']}: {len(collected)}/{result['total']} photos")

        # Equivalent to the "pages exhausted OR page 40" stop condition.
        if page_num >= min(result["pages"], 40):
            break

        page_num += 1
        time.sleep(random.uniform(0.8, 1.5))

    return collected[:max_photos]

Bypassing the 4000-Result Limit with Date Windows

The search API caps results at 4000 (page 40 × 100 per page) regardless of how many total matches exist. For tags or subjects with millions of photos, split queries by date range:

from datetime import datetime, timedelta


def search_with_date_windows(
    text: str,
    start_date: str,
    end_date: str,
    window_days: int = 30,
    **kwargs,
) -> list[dict]:
    """
    Search a large date range by splitting it into non-overlapping windows.

    Each window issues its own query, sidestepping the 4000-result cap that
    applies per search.
    """
    date_fmt = "%Y-%m-%d"
    window_start = datetime.strptime(start_date, date_fmt)
    final_date = datetime.strptime(end_date, date_fmt)
    collected: list[dict] = []

    while window_start < final_date:
        window_stop = window_start + timedelta(days=window_days)
        if window_stop > final_date:
            window_stop = final_date

        print(f"  Window: {window_start.strftime('%Y-%m-%d')} to {window_stop.strftime('%Y-%m-%d')}")
        window_photos = search_all_pages(
            text=text,
            min_taken_date=window_start.strftime("%Y-%m-%d"),
            max_taken_date=window_stop.strftime("%Y-%m-%d"),
            **kwargs,
        )
        collected.extend(window_photos)
        print(f"    Got {len(window_photos)} photos (total: {len(collected)})")

        # Advance past the window's last day so boundary dates aren't re-fetched.
        window_start = window_stop + timedelta(days=1)
        time.sleep(2.0)  # Pause between windows
    return collected

Extracting EXIF Data

Flickr stores camera settings for millions of photos. This is valuable for photography analysis, camera market research, or training ML models where camera metadata is needed:

def get_photo_exif(photo_id: str) -> dict:
    """Fetch EXIF metadata for one photo, normalized to a flat dict.

    Returns {} when the photo is missing or its EXIF is unavailable.
    """
    try:
        data = flickr_request("flickr.photos.getExif", {"photo_id": photo_id})
    except FlickrAPIError as err:
        # Code 2 = photo not found; code 100 = no EXIF available
        # (owner disabled visibility, or the original had none).
        if err.code in (2, 100):
            return {}
        raise

    if data.get("stat") != "ok":
        return {}

    meta = data["photo"]

    # Flatten the entry list into label -> value, preferring Flickr's
    # "clean" rendering and falling back to the raw value.
    values = {}
    for entry in meta.get("exif", []):
        fallback = entry.get("raw", {}).get("_content", "")
        values[entry["label"]] = entry.get("clean", {}).get("_content", fallback)

    return {
        "camera": meta.get("camera"),
        "make": values.get("Make"),
        "model": values.get("Model"),
        "lens": values.get("Lens Model") or values.get("Lens") or values.get("LensModel"),
        "focal_length": values.get("Focal Length"),
        "focal_length_equiv": values.get("Focal Length 35mm Equivalent"),
        "aperture": values.get("Aperture"),
        "shutter_speed": values.get("Exposure") or values.get("Shutter Speed"),
        "iso": values.get("ISO Speed") or values.get("ISO Speed Ratings"),
        "flash": values.get("Flash"),
        "white_balance": values.get("White Balance"),
        "exposure_mode": values.get("Exposure Mode"),
        "metering_mode": values.get("Metering Mode"),
        "software": values.get("Software"),
        "gps_latitude": values.get("GPS Latitude"),
        "gps_longitude": values.get("GPS Longitude"),
        "gps_altitude": values.get("GPS Altitude"),
        "color_space": values.get("Color Space"),
    }


def get_photo_info(photo_id: str, secret: str = None) -> dict:
    """Fetch full metadata for a single photo via flickr.photos.getInfo."""
    query = {"photo_id": photo_id}
    if secret:
        query["secret"] = secret

    photo = flickr_request("flickr.photos.getInfo", query)["photo"]
    owner = photo["owner"]
    dates = photo["dates"]
    tag_entries = photo["tags"]["tag"]

    return {
        "id": photo["id"],
        "secret": photo["secret"],
        "server": photo["server"],
        "farm": photo["farm"],
        # Scalar text fields arrive wrapped as {"_content": ...}.
        "title": photo["title"]["_content"],
        "description": photo["description"]["_content"],
        "owner_id": owner["nsid"],
        "owner_name": owner["username"],
        "owner_realname": owner.get("realname"),
        "date_taken": dates["taken"],
        "date_posted": dates["posted"],
        "date_updated": dates.get("lastupdate"),
        "views": int(photo.get("views", 0)),
        "tags": [t["raw"] for t in tag_entries],
        "machine_tags": [t["raw"] for t in tag_entries if t.get("machine_tag") == "1"],
        "license_id": photo["license"],
        "safety_level": photo.get("safety_level"),
        "url": photo["urls"]["url"][0]["_content"],
        "media": photo.get("media"),
        "original_format": photo.get("originalformat"),
        "location": photo.get("location"),
        "geoperms": photo.get("geoperms"),
    }

Geo-Tagged Photo Collection

Flickr's geo search API allows bounding box queries — useful for building location-tagged image datasets:

def search_by_location(
    lat: float,
    lon: float,
    radius_km: float = 5.0,
    min_taken_date: Optional[str] = None,
    license_ids: Optional[str] = None,
    per_page: int = 100,
    page: int = 1,
) -> dict:
    """Search photos near a geographic point.

    Args:
        lat, lon: Center of the search circle.
        radius_km: Search radius in kilometres.
            NOTE(review): Flickr documents a server-side maximum radius
            (32 km) — confirm before passing large values.
        min_taken_date: Optional lower bound on taken date.
        license_ids: Optional comma-separated license ID filter.
        per_page: Results per page, capped at the API maximum of 500
            (consistency fix: search_photos capped this but this function
            previously did not).
        page: Result page to fetch.

    Returns:
        {"photos": [normalized records], "total": int, "pages": int}
    """
    params = {
        "lat": lat,
        "lon": lon,
        "radius": radius_km,
        "radius_units": "km",
        "has_geo": 1,
        "per_page": min(per_page, 500),
        "page": page,
        "extras": "geo,url_m,url_l,date_taken,tags,views,license",
        "safe_search": 1,
    }

    if min_taken_date:
        params["min_taken_date"] = min_taken_date
    if license_ids:
        params["license"] = license_ids

    data = flickr_request("flickr.photos.search", params)
    photos_data = data["photos"]

    return {
        "photos": [
            {
                "id": p["id"],
                "title": p.get("title"),
                "owner_id": p["owner"],
                "lat": float(p.get("latitude", 0) or 0),
                "lon": float(p.get("longitude", 0) or 0),
                "date_taken": p.get("datetaken"),
                "tags": p.get("tags", "").split(),
                "views": int(p.get("views", 0)),
                "license_id": p.get("license"),
                "url_medium": p.get("url_m"),
                "url_large": p.get("url_l"),
            }
            for p in photos_data.get("photo", [])
        ],
        "total": int(photos_data.get("total", 0)),
        "pages": int(photos_data.get("pages", 0)),
    }


def search_by_bbox(
    min_lat: float, min_lon: float,
    max_lat: float, max_lon: float,
    per_page: int = 100,
    **kwargs,
) -> list[dict]:
    """
    Search photos within a bounding box.
    bbox format: min_lon,min_lat,max_lon,max_lat (Flickr convention)
    """
    query = {
        "bbox": f"{min_lon},{min_lat},{max_lon},{max_lat}",
        "has_geo": 1,
        "per_page": per_page,
        "extras": "geo,url_m,url_l,date_taken,views,license",
        # Caller-supplied keys win, matching the original update() order.
        **kwargs,
    }

    response = flickr_request("flickr.photos.search", query)
    return response["photos"].get("photo", [])

Group Pools

Groups on Flickr are curated collections, often with specific photographic themes (wildlife, street photography, specific camera models). Scraping group pools gives you topical image sets that have already been human-curated:

def get_group_info(group_id: str = None, group_path_alias: str = None) -> dict:
    """Get metadata for a Flickr group.

    Exactly one identifier should be supplied; ``group_id`` takes
    precedence when both are given.

    Raises:
        ValueError: If neither identifier is supplied. (Previously the
            call fell through to Flickr with empty params, producing an
            opaque server-side error.)
    """
    if not group_id and not group_path_alias:
        raise ValueError("get_group_info requires group_id or group_path_alias")

    params = {}
    if group_id:
        params["group_id"] = group_id
    else:
        params["group_path_alias"] = group_path_alias

    data = flickr_request("flickr.groups.getInfo", params)
    group = data["group"]

    # Flickr wraps scalar fields in {"_content": ...} containers.
    return {
        "id": group["id"],
        "name": group["name"]["_content"],
        "description": group.get("description", {}).get("_content", ""),
        "members": int(group.get("members", {}).get("_content", 0)),
        "pool_count": int(group.get("pool_count", {}).get("_content", 0)),
        "topic_count": int(group.get("topic_count", {}).get("_content", 0)),
    }


def get_group_photos(
    group_id: str,
    per_page: int = 100,
    page: int = 1,
    extras: str = None,
) -> dict:
    """Fetch one page of photos from a Flickr group pool."""
    payload = flickr_request("flickr.groups.pools.getPhotos", {
        "group_id": group_id,
        "per_page": per_page,
        "page": page,
        "extras": extras or "url_m,url_l,date_taken,owner_name,tags,views,license",
    })["photos"]

    records = []
    for item in payload.get("photo", []):
        records.append({
            "id": item["id"],
            "title": item.get("title"),
            "owner_id": item["owner"],
            "owner_name": item.get("ownername"),
            "views": int(item.get("views", 0)),
            "license_id": item.get("license"),
            "url_medium": item.get("url_m"),
            "url_large": item.get("url_l"),
        })

    return {
        "photos": records,
        "total": int(payload.get("total", 0)),
        "pages": int(payload.get("pages", 0)),
    }


def scrape_group_all_photos(
    group_id: str,
    max_pages: int = 40,
) -> list[dict]:
    """Collect all photos from a group pool (up to 4000)."""
    collected: list[dict] = []
    page = 0

    while page < max_pages:
        page += 1
        result = get_group_photos(group_id, page=page)
        if not result["photos"]:
            break

        collected.extend(result["photos"])
        print(f"  Group pool page {page}/{result['pages']}: {len(collected)} total")

        # Stop once the pool's final page has been consumed.
        if page >= result["pages"]:
            break

        time.sleep(random.uniform(0.8, 1.5))

    return collected
def get_user_photos(
    user_id: str,
    per_page: int = 100,
    page: int = 1,
    min_taken_date: str = None,
    extras: str = None,
) -> dict:
    """Fetch one page of a user's public photostream."""
    query = {
        "user_id": user_id,
        "per_page": per_page,
        "page": page,
        "extras": extras or "url_m,url_l,date_taken,views,tags,license,original_format",
    }
    if min_taken_date:
        query["min_taken_date"] = min_taken_date

    payload = flickr_request("flickr.people.getPublicPhotos", query)["photos"]

    records = []
    for item in payload.get("photo", []):
        records.append({
            "id": item["id"],
            "title": item.get("title"),
            "date_taken": item.get("datetaken"),
            "views": int(item.get("views", 0)),
            "license_id": item.get("license"),
            "url_medium": item.get("url_m"),
            "url_large": item.get("url_l"),
            "original_format": item.get("originalformat"),
        })

    return {
        "photos": records,
        "total": int(payload.get("total", 0)),
        "pages": int(payload.get("pages", 0)),
    }


def find_user_by_email(email: str) -> dict:
    """Look up a Flickr user by their email address (requires auth).

    Returns the "user" payload, or {} when no match is reported.
    """
    response = flickr_request("flickr.people.findByEmail", {"find_email": email})
    return response.get("user", {})


def get_user_profile(user_id: str) -> dict:
    """Fetch public profile info for a Flickr user."""
    person = flickr_request("flickr.people.getInfo", {"user_id": user_id})["person"]
    photos_meta = person.get("photos", {})

    def optional_text(field: str) -> str:
        # Flickr wraps optional scalar fields as {"_content": ...}.
        return person.get(field, {}).get("_content", "")

    return {
        "id": person["id"],
        "nsid": person["nsid"],
        "username": person["username"]["_content"],
        "realname": optional_text("realname"),
        "location": optional_text("location"),
        "description": optional_text("description"),
        "photos_count": int(photos_meta.get("count", {}).get("_content", 0)),
        "joined": photos_meta.get("firstdate", {}).get("_content"),
        "profile_url": person["profileurl"]["_content"],
        "is_pro": person.get("ispro", 0) == 1,
    }

Anti-Bot Measures and Rate Limits

Flickr is API-friendly, but large-scale collection hits specific walls:

Hourly API rate limit. Flickr allows 3,600 API calls per hour per key — exactly 1 per second average. Burst above that and requests return error code 18 (Rate limit exceeded). The _last_request_time tracking in the base client handles this automatically.

Search result cap. The photos.search endpoint returns at most 4,000 results per query (page 40 × 100). Use date-window splitting as shown above for high-volume subjects.

EXIF availability. Photo owners can disable EXIF visibility, and phone uploads often strip EXIF before upload. Expect full camera EXIF on roughly 40-60% of DSLR shots, much less for smartphone photos.

CDN throttling on image downloads. While API calls are rate-limited per key, downloading the actual image files (the url_l, url_o URLs) is throttled per IP by Flickr's static CDN. Fetching images rapidly from one IP triggers 429s and temporary soft blocks.

Multiple API keys in parallel. For higher throughput than 1 req/sec, the Flickr API terms technically prohibit using multiple keys from the same application — but many researchers use separate keys for separate research tasks. Each key gets its own 3,600 req/hour bucket.

For bulk image downloads at scale — building datasets with hundreds of thousands of images — you need to distribute the download requests across multiple IPs. ThorData's rotating residential proxies are effective for this because Flickr's CDN applies standard browser-like rate limits to residential IPs rather than the aggressive throttling it applies to recognized datacenter ranges:

import httpx
import random

# Rotating residential proxy credentials/endpoint for image downloads.
# NOTE: placeholders — keep real credentials in environment/config, not source.
THORDATA_USER = "your_user"
THORDATA_PASS = "your_pass"
THORDATA_HOST = "proxy.thordata.com"
THORDATA_PORT = 9000


def make_download_client(session_id: str = None) -> httpx.Client:
    """Build an httpx client routed through the ThorData proxy for image downloads.

    A truthy ``session_id`` is appended to the proxy username, which pins
    the client to a sticky proxy session (one stable exit IP per worker).
    NOTE(review): newer httpx releases renamed the ``proxies`` argument
    (``proxy=`` / ``mounts=``) — confirm against the pinned httpx version.
    """
    proxy_user = f"{THORDATA_USER}-session-{session_id}" if session_id else THORDATA_USER
    endpoint = f"http://{proxy_user}:{THORDATA_PASS}@{THORDATA_HOST}:{THORDATA_PORT}"

    return httpx.Client(
        proxies={"https://": endpoint, "http://": endpoint},
        timeout=30.0,
        follow_redirects=True,
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            "Referer": "https://www.flickr.com/",
        },
    )


def download_photos_batch(
    photos: list[dict],
    output_dir: str = "flickr_downloads",
    prefer_size: str = "large",
    max_workers: int = 4,
) -> dict:
    """
    Download a batch of photos with parallel workers, each using a different
    proxy session.

    Args:
        photos: Normalized photo records (as produced by search_photos);
            each needs an "id" and at least one url_* field to be fetched.
        output_dir: Target directory, created if missing.
        prefer_size: Size key suffix to try first ("medium"/"large"/"original").
        max_workers: Parallel download threads (one sticky proxy session each).

    Returns:
        {photo_id: local_path} mapping; path is None for skips/failures.
    """
    import os
    from concurrent.futures import ThreadPoolExecutor, as_completed

    os.makedirs(output_dir, exist_ok=True)
    results = {}

    def download_one(photo: dict, worker_id: int) -> tuple:
        photo_id = photo["id"]
        url = photo.get(f"url_{prefer_size}") or photo.get("url_medium") or photo.get("url_large")
        if not url:
            return photo_id, None

        # Bug fix: records often carry original_format=None (key present,
        # value null — exactly what search_photos emits for non-original
        # sizes), which .get(..., "jpg") did not cover and produced
        # filenames like "12345.None".
        ext = photo.get("original_format") or "jpg"
        filename = f"{photo_id}.{ext}"
        filepath = os.path.join(output_dir, filename)

        if os.path.exists(filepath):
            return photo_id, filepath  # Already downloaded

        client = make_download_client(session_id=f"worker-{worker_id}")
        try:
            resp = client.get(url)
            if resp.status_code == 200:
                with open(filepath, "wb") as f:
                    f.write(resp.content)
                return photo_id, filepath
        except Exception as e:
            # Best-effort batch: log and continue rather than abort the pool.
            print(f"  Failed {photo_id}: {e}")
        finally:
            client.close()

        return photo_id, None

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(download_one, photo, i % max_workers): photo["id"]
            for i, photo in enumerate(photos)
        }
        for future in as_completed(futures):
            photo_id, path = future.result()
            results[photo_id] = path
            # NOTE(review): this sleep paces result *collection* in the main
            # thread, not the downloads themselves — confirm intent.
            time.sleep(random.uniform(0.2, 0.5))

    success = sum(1 for p in results.values() if p)
    print(f"Downloaded {success}/{len(photos)} photos to {output_dir}")
    return results

Storing Photo Metadata

For building persistent datasets:

import sqlite3
import json


def init_flickr_db(path: str = "flickr_photos.db") -> sqlite3.Connection:
    """Open (and initialize if needed) the photo-metadata SQLite database.

    Creates the photos and photo_exif tables plus lookup indexes, then
    returns the open connection.
    """
    conn = sqlite3.connect(path)

    conn.execute("""
        CREATE TABLE IF NOT EXISTS photos (
            id              TEXT PRIMARY KEY,
            owner_id        TEXT,
            owner_name      TEXT,
            title           TEXT,
            description     TEXT,
            date_taken      TEXT,
            date_uploaded   TEXT,
            tags            TEXT,
            views           INTEGER,
            faves           INTEGER,
            license_id      TEXT,
            latitude        REAL,
            longitude       REAL,
            url_medium      TEXT,
            url_large       TEXT,
            url_original    TEXT,
            original_format TEXT,
            local_path      TEXT,
            collected_at    TEXT DEFAULT (datetime('now'))
        )
    """)

    conn.execute("""
        CREATE TABLE IF NOT EXISTS photo_exif (
            photo_id        TEXT PRIMARY KEY,
            camera          TEXT,
            make            TEXT,
            model           TEXT,
            lens            TEXT,
            focal_length    TEXT,
            aperture        TEXT,
            shutter_speed   TEXT,
            iso             TEXT,
            flash           TEXT,
            gps_latitude    TEXT,
            gps_longitude   TEXT,
            collected_at    TEXT DEFAULT (datetime('now')),
            FOREIGN KEY (photo_id) REFERENCES photos(id)
        )
    """)

    # Bug fix: sqlite3's execute() runs a single statement only, so passing
    # three ';'-separated CREATE INDEX statements raised ProgrammingError
    # ("You can only execute one statement at a time"). executescript()
    # runs the whole batch.
    conn.executescript("""
        CREATE INDEX IF NOT EXISTS idx_photos_owner ON photos(owner_id);
        CREATE INDEX IF NOT EXISTS idx_photos_date ON photos(date_taken);
        CREATE INDEX IF NOT EXISTS idx_photos_license ON photos(license_id);
    """)

    conn.commit()
    return conn


def save_photos(conn: sqlite3.Connection, photos: list[dict]):
    """Bulk-insert photo metadata records; rows with existing ids are skipped."""
    rows = []
    for p in photos:
        rows.append((
            p["id"],
            p.get("owner_id"),
            p.get("owner_name"),
            p.get("title"),
            p.get("date_taken"),
            p.get("date_uploaded"),
            json.dumps(p.get("tags", [])),  # tag list stored as a JSON string
            p.get("views", 0),
            p.get("faves", 0),
            p.get("license_id"),
            p.get("latitude"),
            p.get("longitude"),
            p.get("url_medium"),
            p.get("url_large"),
            p.get("url_original"),
            p.get("original_format"),
        ))

    conn.executemany("""
        INSERT OR IGNORE INTO photos
        (id, owner_id, owner_name, title, date_taken, date_uploaded,
         tags, views, faves, license_id, latitude, longitude,
         url_medium, url_large, url_original, original_format)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """, rows)
    conn.commit()


def save_exif(conn: sqlite3.Connection, photo_id: str, exif: dict):
    """Upsert one photo's EXIF record; empty dicts are silently skipped."""
    if not exif:
        return

    # Column order matches the INSERT column list below.
    fields = ("camera", "make", "model", "lens", "focal_length", "aperture",
              "shutter_speed", "iso", "flash", "gps_latitude", "gps_longitude")
    conn.execute("""
        INSERT OR REPLACE INTO photo_exif
        (photo_id, camera, make, model, lens, focal_length, aperture,
         shutter_speed, iso, flash, gps_latitude, gps_longitude)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """, (photo_id, *(exif.get(name) for name in fields)))
    conn.commit()

Practical Dataset Building

A complete workflow for building a tagged image dataset:

def build_dataset(
    subject: str,
    license_ids: str = CREATIVE_COMMONS_LICENSES,
    start_date: str = "2020-01-01",
    end_date: str = "2026-01-01",
    target_size: int = 10000,
    download_images: bool = True,
    output_dir: str = "dataset",
    **search_kwargs,
):
    """Build a labeled image dataset from Flickr for a given subject.

    Extra keyword arguments (e.g. has_geo=True, as in the usage example
    below) are forwarded to the search — the previous version hard-coded
    has_geo=False and offered no way to override it.
    """
    import os

    # Bug fix: sqlite3.connect cannot create missing directories, so the
    # original crashed on the first run when output_dir did not exist.
    os.makedirs(output_dir, exist_ok=True)
    conn = init_flickr_db(f"{output_dir}/metadata.db")

    print(f"Searching Flickr for '{subject}'...")
    photos = search_with_date_windows(
        text=subject,
        start_date=start_date,
        end_date=end_date,
        license_ids=license_ids,
        **search_kwargs,
    )

    print(f"Found {len(photos)} photos. Saving metadata...")
    save_photos(conn, photos)

    if download_images:
        print(f"Downloading images to {output_dir}/images/...")
        download_photos_batch(
            photos[:target_size],
            output_dir=f"{output_dir}/images",
            max_workers=4,
        )

    print(f"Dataset complete: {len(photos)} metadata records, up to {target_size} images")
    conn.close()


# Example: wildlife photography dataset with GPS coordinates
# build_dataset("wildlife photography", has_geo=True, target_size=5000)

Flickr's API terms permit non-commercial research and personal use. The key restriction is that you must attribute photographers when publishing or redistributing images (even CC-licensed ones require attribution). For published datasets, store each photo's license ID and photographer name alongside the image so attribution can be reproduced later.

The Creative Commons license filtering (license=1,2,3,4,5,6,9,10) is the most important tool for building legally usable datasets. The difference between license=4 (CC BY) and no license filter is enormous in terms of what you can do with the resulting dataset.