← Back to blog

How to Scrape Historical Weather Data in 2026: Open-Meteo, NOAA & Weather Underground

Historical weather data powers everything from agricultural forecasting to insurance risk models, real estate analysis, and event planning tools. The challenge isn't that weather data is hidden -- it's scattered across dozens of sources with different formats, coverage gaps, and access methods.

This guide covers the three most practical sources: Open-Meteo (free API, global coverage), NOAA (the gold standard for US data), and Weather Underground (community station data you can't get elsewhere). We'll use Python for all three.

What Data Can You Collect?

Across these sources, you can get: daily and hourly temperature (max/min/mean), precipitation, rain and snowfall totals, wind speed and gusts, humidity, surface pressure, cloud cover, sunshine duration, and station metadata such as coordinates, elevation, and period of record.

Anti-Bot Measures and Rate Limits

Weather data sources vary widely in how they handle automated access:

  1. Open-Meteo -- Generous for a free API. No API key needed. Rate limit is ~10,000 requests/day per IP. Exceeding it returns 429 with a retry-after header.
  2. NOAA CDO API -- Requires a free token. Limit is 5 requests per second and 10,000 per day. Well-documented limits, clean error messages.
  3. Weather Underground -- The hardest to scrape. Cloudflare protection, JavaScript rendering, aggressive bot detection. Datacenter IPs get blocked fast.
  4. IP-based throttling -- All three sources throttle by IP. For bulk collection across thousands of cities, you'll hit limits quickly from a single IP.

For Weather Underground scraping and bulk Open-Meteo requests, rotating residential proxies make the difference between finishing your dataset and getting blocked halfway through. ThorData works well here -- their residential IPs don't trigger the bot detection that datacenter IPs do on Weather Underground, and the rotation keeps you under per-IP rate limits on Open-Meteo.

Open-Meteo: Free Historical Weather API

The simplest option. No API key, global coverage, data back to 1940.

pip install requests pandas

Fetching Historical Data

import requests
import pandas as pd
from datetime import datetime, date

def get_historical_weather(lat: float, lon: float, start: str, end: str) -> pd.DataFrame:
    """Fetch daily historical weather from Open-Meteo's archive API.

    Args:
        lat, lon: Coordinates of the location.
        start, end: Inclusive date range, 'YYYY-MM-DD'.

    Returns:
        DataFrame with one row per day: temperature, precipitation,
        wind, humidity and sunshine columns.

    Raises:
        requests.HTTPError: On a non-2xx response (e.g. 429 when the
            ~10,000 requests/day per-IP limit is exceeded).
    """
    url = "https://archive-api.open-meteo.com/v1/archive"
    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": start,
        "end_date": end,
        "daily": ",".join([
            "temperature_2m_max",
            "temperature_2m_min",
            "temperature_2m_mean",
            "precipitation_sum",
            "rain_sum",
            "snowfall_sum",
            "windspeed_10m_max",
            "windgusts_10m_max",
            "relative_humidity_2m_mean",
            "pressure_msl_mean",
            "sunshine_duration",
            "et0_fao_evapotranspiration",
        ]),
        "timezone": "auto",
    }

    resp = requests.get(url, params=params, timeout=30)
    resp.raise_for_status()
    daily = resp.json()["daily"]

    n_days = len(daily["time"])
    # Optional series: fall back to None-filled columns of the right
    # length so the DataFrame constructor never sees mismatched lengths
    # when the API omits a variable.
    humidity = daily.get("relative_humidity_2m_mean") or [None] * n_days
    sunshine = daily.get("sunshine_duration") or [None] * n_days

    df = pd.DataFrame({
        "date": pd.to_datetime(daily["time"]),
        "temp_max_c": daily["temperature_2m_max"],
        "temp_min_c": daily["temperature_2m_min"],
        "temp_mean_c": daily["temperature_2m_mean"],
        "precipitation_mm": daily["precipitation_sum"],
        "rain_mm": daily["rain_sum"],
        "snowfall_cm": daily["snowfall_sum"],
        "wind_max_kmh": daily["windspeed_10m_max"],
        "wind_gusts_kmh": daily["windgusts_10m_max"],
        "humidity_pct": humidity,
        # API reports seconds; convert to hours. `is not None` keeps a
        # genuine 0.0 s of sunshine as 0.0 h instead of dropping it to None.
        "sunshine_hours": [s / 3600 if s is not None else None for s in sunshine],
    })

    return df


# Example: New York City, full year 2025
# NOTE: hits the live Open-Meteo archive API -- requires network access.
df = get_historical_weather(40.7128, -74.0060, "2025-01-01", "2025-12-31")
print(f"Records: {len(df)}")
print(f"Hottest day: {df.loc[df['temp_max_c'].idxmax(), 'date'].date()} ({df['temp_max_c'].max():.1f}C)")
print(f"Coldest day: {df.loc[df['temp_min_c'].idxmin(), 'date'].date()} ({df['temp_min_c'].min():.1f}C)")
print(f"Total rain: {df['rain_mm'].sum():.0f}mm")
print(f"Total snowfall: {df['snowfall_cm'].sum():.0f}cm")

Fetching Hourly Data

For hourly resolution, use the hourly parameter set:

def get_hourly_weather(lat: float, lon: float, start: str, end: str) -> pd.DataFrame:
    """Fetch hourly historical weather from Open-Meteo's archive API.

    start/end use 'YYYY-MM-DD'; returns one row per hour.
    """
    response = requests.get(
        "https://archive-api.open-meteo.com/v1/archive",
        params={
            "latitude": lat,
            "longitude": lon,
            "start_date": start,
            "end_date": end,
            "hourly": "temperature_2m,relative_humidity_2m,precipitation,rain,wind_speed_10m,wind_direction_10m,surface_pressure,cloud_cover,visibility",
            "timezone": "auto",
        },
        timeout=30,
    )
    response.raise_for_status()
    payload = response.json()["hourly"]

    # Map output column name -> API variable name, in display order.
    column_map = {
        "temp_c": "temperature_2m",
        "humidity_pct": "relative_humidity_2m",
        "precip_mm": "precipitation",
        "wind_speed_kmh": "wind_speed_10m",
        "wind_dir_deg": "wind_direction_10m",
        "pressure_hpa": "surface_pressure",
        "cloud_cover_pct": "cloud_cover",
    }
    frame = {"datetime": pd.to_datetime(payload["time"])}
    frame.update({out_col: payload[src] for out_col, src in column_map.items()})
    return pd.DataFrame(frame)

Bulk City Collection

import time

# Reference coordinates (lat, lon) used by the bulk-collection examples below.
CITIES = {
    "New York": (40.7128, -74.0060),
    "London": (51.5074, -0.1278),
    "Tokyo": (35.6762, 139.6503),
    "Sydney": (-33.8688, 151.2093),
    "Sao Paulo": (-23.5505, -46.6333),
    "Dubai": (25.2048, 55.2708),
    "Toronto": (43.6532, -79.3832),
    "Berlin": (52.5200, 13.4050),
    "Singapore": (1.3521, 103.8198),
    "Mumbai": (19.0760, 72.8777),
    "Cairo": (30.0444, 31.2357),
    "Mexico City": (19.4326, -99.1332),
}

def collect_multi_city(cities: dict, start: str, end: str, proxy: dict = None) -> dict:
    """Fetch daily history for every city in *cities*.

    Returns a dict of city name -> DataFrame. Cities that fail are
    skipped (the error is printed, not raised).

    NOTE(review): *proxy* is accepted but not used in this function --
    confirm whether it should be threaded through to the request layer.
    """
    collected = {}
    for name, coords in cities.items():
        lat, lon = coords
        try:
            frame = get_historical_weather(lat, lon, start, end)
        except Exception as exc:
            print(f"Error {name}: {exc}")
        else:
            collected[name] = frame
            print(f"Done {name}: {len(frame)} days")
        # Gentle pacing between cities to respect per-IP rate limits.
        time.sleep(0.5)

    return collected


# Fetch a full year of daily data for every city (network required).
data = collect_multi_city(CITIES, "2025-01-01", "2025-12-31")

NOAA Climate Data Online

NOAA's CDO API is the authoritative source for US weather station data. Get a free token at ncdc.noaa.gov/cdo-web/token.

# Personal CDO API token -- request one for free at ncdc.noaa.gov/cdo-web/token.
NOAA_TOKEN = "YOUR_TOKEN"

def get_noaa_data(station_id: str, start: str, end: str, dataset: str = "GHCND") -> list:
    """Fetch daily weather from the NOAA CDO API, following pagination.

    Args:
        station_id: e.g., 'GHCND:USW00094728' (Central Park, NYC).
        start, end: Inclusive date range, 'YYYY-MM-DD'.
        dataset: NOAA dataset id; GHCND is daily summaries.

    Returns:
        List of raw record dicts -- one per datatype per day (long
        format; see pivot_noaa_records for reshaping).

    Raises:
        requests.HTTPError: On a non-2xx response (bad token, rate limit).
    """
    url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/data"
    headers = {"token": NOAA_TOKEN}
    params = {
        "datasetid": dataset,
        "stationid": station_id,
        "startdate": start,
        "enddate": end,
        "datatypeid": "TMAX,TMIN,PRCP,SNOW,AWND,TAVG",
        "units": "metric",
        "limit": 1000,  # CDO maximum page size
    }

    all_records = []
    offset = 1  # CDO offsets are 1-based

    while True:
        params["offset"] = offset
        resp = requests.get(url, headers=headers, params=params, timeout=15)
        resp.raise_for_status()
        data = resp.json()

        results = data.get("results", [])
        if not results:
            break

        all_records.extend(results)
        offset += len(results)

        # Stop once we've paged past the server-reported total.
        total_count = data.get("metadata", {}).get("resultset", {}).get("count", 0)
        if offset > total_count:
            break
        time.sleep(0.25)  # stay under the 5 requests/second limit

    return all_records


# Central Park station, full year 2025
# Requires a valid NOAA_TOKEN and network access.
records = get_noaa_data("GHCND:USW00094728", "2025-01-01", "2025-12-31")
for r in records[:5]:
    print(f"{r['date'][:10]} | {r['datatype']}: {r['value']}")

Pivoting NOAA Records to Wide Format

NOAA returns one row per data type per day. Pivot to get one row per day:

def pivot_noaa_records(records: list) -> pd.DataFrame:
    """Reshape NOAA long-format records (one row per datatype per day)
    into wide format: one row per day, one column per datatype.
    """
    long_df = pd.DataFrame(records)
    if long_df.empty:
        return long_df

    long_df["date"] = pd.to_datetime(long_df["date"])
    wide = (
        long_df
        .pivot_table(index="date", columns="datatype", values="value", aggfunc="first")
        .rename_axis(columns=None)
        .reset_index()
    )

    # Friendly names for the standard GHCND datatypes, applied only
    # to columns that are actually present in the response.
    friendly = {
        "TMAX": "temp_max_c",
        "TMIN": "temp_min_c",
        "TAVG": "temp_avg_c",
        "PRCP": "precip_mm",
        "SNOW": "snowfall_mm",
        "AWND": "avg_wind_speed_ms",
    }
    present = {raw: nice for raw, nice in friendly.items() if raw in wide.columns}
    return wide.rename(columns=present)

Finding Weather Stations

def find_stations(lat: float, lon: float, radius_deg: float = 0.5) -> list:
    """Find NOAA GHCND weather stations near a location.

    Args:
        lat, lon: Center of the search box.
        radius_deg: Half-width of the bounding box, in degrees.

    Returns:
        Up to 25 station dicts with id, name, coordinates, elevation
        and the station's period of record.

    Raises:
        requests.HTTPError: On a non-2xx response (bad token, rate limit).
    """
    url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/stations"
    headers = {"token": NOAA_TOKEN}
    params = {
        "datasetid": "GHCND",
        # CDO extent format: south,west,north,east
        "extent": f"{lat-radius_deg},{lon-radius_deg},{lat+radius_deg},{lon+radius_deg}",
        "limit": 25,
    }

    resp = requests.get(url, headers=headers, params=params, timeout=15)
    # Fail loudly on HTTP errors instead of silently returning [] from
    # an error payload (other functions in this file do the same).
    resp.raise_for_status()
    data = resp.json()

    return [
        {
            "id": s["id"],
            "name": s["name"],
            "lat": s.get("latitude"),
            "lon": s.get("longitude"),
            "elevation_m": s.get("elevation"),
            "min_date": s.get("mindate"),
            "max_date": s.get("maxdate"),
        }
        for s in data.get("results", [])
    ]


# Find stations near Chicago
# Prints each station id plus the period of record it covers.
chicago_stations = find_stations(41.8781, -87.6298)
for s in chicago_stations[:5]:
    print(f"{s['id']}: {s['name']} ({s['min_date']} to {s['max_date']})")

Scraping Weather Underground

Weather Underground has data from 250,000+ personal weather stations -- granularity you can't get from NOAA or Open-Meteo. But it requires scraping.

from bs4 import BeautifulSoup
import json

def scrape_wunderground(station_id: str, date_str: str, proxy: str = None) -> dict:
    """Scrape daily weather from Weather Underground.

    Args:
        station_id: PWS id, e.g. 'KNYNEWYO722' (a station in New York).
        date_str: 'YYYY-MM-DD'.
        proxy: Optional proxy URL, e.g.
            'http://USER:[email protected]:9000'.

    Returns:
        The embedded observations JSON object, or {} when the page
        could not be parsed (e.g. blocked by Cloudflare) -- callers
        treat this as best-effort.
    """
    url = f"https://www.wunderground.com/dashboard/pws/{station_id}/table/{date_str}/{date_str}/daily"
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml",
        "Accept-Language": "en-US,en;q=0.9",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
    }

    proxies = {"https": proxy, "http": proxy} if proxy else None
    resp = requests.get(url, headers=headers, proxies=proxies, timeout=20)

    soup = BeautifulSoup(resp.text, "html.parser")

    # Weather Underground embeds data as JSON inside script tags.
    # raw_decode parses exactly one JSON value starting at the offset
    # and handles braces inside string values correctly -- a manual
    # brace-counting scan would miscount those.
    decoder = json.JSONDecoder()
    for script in soup.find_all("script"):
        text = script.string or ""
        start = text.find('{"observations"')
        if start >= 0:
            try:
                obj, _ = decoder.raw_decode(text, start)
                return obj
            except json.JSONDecodeError:
                pass

    return {}


# Use residential proxy to avoid Cloudflare blocks
# Replace USER/PASS/host with real proxy credentials before running.
PROXY = "http://USER:[email protected]:9000"
data = scrape_wunderground("KNYNEWYO722", "2025-06-15", proxy=PROXY)
observations = data.get("observations", [])
if observations:
    first = observations[0]
    print(f"Temp: {first.get('metric', {}).get('tempAvg')}C")
    print(f"Precip: {first.get('metric', {}).get('precipTotal')}mm")

Calculating Derived Metrics

Raw temperature and precipitation data is more useful with derived metrics:

def add_derived_metrics(df: pd.DataFrame) -> pd.DataFrame:
    """Add derived weather metrics to a daily dataframe.

    Expects columns temp_mean_c, temp_max_c, temp_min_c, humidity_pct
    and precipitation_mm. Returns a copy; the input is not mutated.
    """
    df = df.copy()

    # Heating/Cooling Degree Days (base 18.3C / 65F)
    base = 18.3
    df["hdd"] = (base - df["temp_mean_c"]).clip(lower=0)
    df["cdd"] = (df["temp_mean_c"] - base).clip(lower=0)

    # Growing Degree Days (base 10C for many crops)
    gdd_base = 10.0
    df["gdd"] = ((df["temp_max_c"] + df["temp_min_c"]) / 2 - gdd_base).clip(lower=0)

    # Heat index (simplified flag for high temp/humidity days)
    df["feels_hot"] = (df["temp_max_c"] > 32) & (df["humidity_pct"] > 60)

    # Frost days
    df["frost_day"] = df["temp_min_c"] < 0

    # Heavy rain days (>25mm/day)
    df["heavy_rain"] = df["precipitation_mm"] > 25

    return df


# Fetch NYC 2025 dailies and enrich with degree days, frost and rain flags.
df = get_historical_weather(40.7128, -74.0060, "2025-01-01", "2025-12-31")
df = add_derived_metrics(df)
print(f"Heating degree days: {df['hdd'].sum():.0f}")
print(f"Cooling degree days: {df['cdd'].sum():.0f}")
print(f"Frost days: {df['frost_day'].sum()}")
print(f"Heavy rain days: {df['heavy_rain'].sum()}")

Storing and Analyzing Weather Data

import sqlite3

def init_weather_db(path: str = "weather.db"):
    """Open (or create) the SQLite store and ensure daily_weather exists.

    (city, date, source) is the primary key, so re-running a collection
    upserts rather than duplicating rows.
    """
    conn = sqlite3.connect(path)
    conn.execute("""
        CREATE TABLE IF NOT EXISTS daily_weather (
            city TEXT,
            lat REAL,
            lon REAL,
            date TEXT,
            temp_max_c REAL,
            temp_min_c REAL,
            temp_mean_c REAL,
            precip_mm REAL,
            wind_max_kmh REAL,
            humidity_pct REAL,
            snowfall_cm REAL,
            hdd REAL,
            cdd REAL,
            gdd REAL,
            source TEXT,
            PRIMARY KEY (city, date, source)
        )
    """)
    conn.commit()
    return conn


def save_weather(conn, city: str, lat: float, lon: float, df: pd.DataFrame, source: str = "open-meteo"):
    """Upsert one row per day of *df* into daily_weather, then commit."""
    sql = "INSERT OR REPLACE INTO daily_weather VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
    # Metric columns in table order; Series.get returns None for any
    # column the dataframe doesn't carry.
    metric_cols = (
        "temp_max_c", "temp_min_c", "temp_mean_c", "precipitation_mm",
        "wind_max_kmh", "humidity_pct", "snowfall_cm", "hdd", "cdd", "gdd",
    )
    for _, day in df.iterrows():
        values = (city, lat, lon, str(day["date"].date()))
        values += tuple(day.get(col) for col in metric_cols)
        values += (source,)
        conn.execute(sql, values)
    conn.commit()


def query_anomalies(conn, city: str, metric: str = "temp_mean_c", z_threshold: float = 2.0) -> list[dict]:
    """Find days with anomalous weather using z-scores.

    Args:
        conn: Open SQLite connection with the daily_weather table.
        city: City name to filter on.
        metric: Column of daily_weather to analyze.
        z_threshold: Minimum |z| to count as an anomaly.

    Returns:
        Anomalies sorted by |z| descending; [] with fewer than 10 rows.

    Raises:
        ValueError: If *metric* is not a plain column identifier (the
            column name is interpolated into SQL, so this guards
            against injection).
    """
    import re
    import statistics

    if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", metric):
        raise ValueError(f"invalid metric column name: {metric!r}")

    rows = conn.execute(
        f"SELECT date, {metric} FROM daily_weather WHERE city = ? AND {metric} IS NOT NULL ORDER BY date",
        (city,)
    ).fetchall()

    if len(rows) < 10:
        return []

    values = [r[1] for r in rows]
    mean = statistics.mean(values)
    stdev = statistics.stdev(values)
    if stdev == 0:
        # All values identical -- no meaningful z-scores.
        return []

    anomalies = []
    for day, val in rows:
        z = (val - mean) / stdev
        if abs(z) >= z_threshold:
            anomalies.append({
                "date": day,
                "value": round(val, 1),
                "z_score": round(z, 2),
                "direction": "hot/wet/windy" if z > 0 else "cold/dry/calm",
            })

    return sorted(anomalies, key=lambda x: abs(x["z_score"]), reverse=True)

Building a Weather Comparison Tool

Compare weather across multiple cities and years:

def compare_cities_annual(cities: dict, year: int, db_path: str = "weather.db") -> pd.DataFrame:
    """Compare annual weather metrics across cities.

    Fetches a full year of daily data per city from Open-Meteo, stores
    it in SQLite via save_weather, then aggregates one summary row per
    city.

    Args:
        cities: Mapping of city name -> (lat, lon).
        year: Calendar year to compare.
        db_path: SQLite file used as the local store.

    Returns:
        DataFrame sorted by average temperature (descending),
        one row per city.
    """
    conn = init_weather_db(db_path)
    start = f"{year}-01-01"
    end = f"{year}-12-31"

    # Collect and store data for each city
    for city, (lat, lon) in cities.items():
        df = get_historical_weather(lat, lon, start, end)
        df = add_derived_metrics(df)
        save_weather(conn, city, lat, lon, df)
        time.sleep(0.5)  # stay well under Open-Meteo's per-IP rate limit

    # Query annual summaries
    summary_rows = conn.execute("""
        SELECT city,
               round(avg(temp_mean_c), 1) as avg_temp,
               round(max(temp_max_c), 1) as max_temp,
               round(min(temp_min_c), 1) as min_temp,
               round(sum(precip_mm), 0) as total_precip,
               sum(CASE WHEN temp_min_c < 0 THEN 1 ELSE 0 END) as frost_days,
               round(sum(hdd), 0) as total_hdd,
               round(sum(cdd), 0) as total_cdd
        FROM daily_weather
        WHERE date >= ? AND date <= ? AND source = 'open-meteo'
        GROUP BY city
        ORDER BY avg_temp DESC
    """, (start, end)).fetchall()

    conn.close()

    columns = ["city", "avg_temp_c", "max_temp_c", "min_temp_c", "total_precip_mm", "frost_days", "hdd", "cdd"]
    return pd.DataFrame(summary_rows, columns=columns)


# Collects and stores a year of data for all 12 cities, then summarizes.
comparison = compare_cities_annual(CITIES, 2025)
print(comparison.to_string(index=False))

Open-Meteo is explicitly free and open-source -- no restrictions on commercial use. NOAA data is public domain (US government). Weather Underground's data comes from personal weather stations whose owners opted into sharing, but WU's Terms of Service restrict scraping. For production use, consider WU's paid API tier. Open-Meteo and NOAA cover the vast majority of use cases without legal concerns.

Key Takeaways

Use Open-Meteo for free global history back to 1940, NOAA's CDO API for authoritative US station records, and Weather Underground only when you need hyperlocal personal-weather-station data. Respect each source's rate limits, cache results locally (e.g., in SQLite) so you never re-fetch the same range, and compute derived metrics like degree days and frost dates from the raw data rather than scraping them.

Climate Change Signal Detection

With decades of historical data, you can detect temperature trend signals:

import numpy as np

def detect_warming_trend(df: pd.DataFrame, column: str = "temp_mean_c") -> dict:
    """Fit a linear trend to detect warming/cooling signals."""
    clean = df.dropna(subset=[column]).copy()
    if len(clean) < 30:
        return {"error": "Not enough data points"}

    # Days elapsed since the first observation (regression x-axis)
    clean["day_num"] = (clean["date"] - clean["date"].min()).dt.days
    x = clean["day_num"].values
    y = clean[column].values
    n = len(x)

    # Ordinary least squares slope, closed form
    sum_x, sum_y = np.sum(x), np.sum(y)
    sum_xy, sum_xx = np.sum(x * y), np.sum(x ** 2)
    slope = (n * sum_xy - sum_x * sum_y) / (n * sum_xx - sum_x ** 2)

    return {
        "column": column,
        "years_of_data": round(x.max() / 365, 1),
        # Slope is degrees/day; 3650 days per decade
        "trend_per_decade_c": round(slope * 3650, 3),
        "direction": "warming" if slope > 0 else "cooling",
        "start_mean": round(y[:30].mean(), 2),
        "end_mean": round(y[-30:].mean(), 2),
        "total_change_c": round(y[-30:].mean() - y[:30].mean(), 2),
    }


# NYC temperature trend 1980-2025
# Pulls ~46 years of daily data in one request -- network required, may be slow.
nyc_data = get_historical_weather(40.7128, -74.0060, "1980-01-01", "2025-12-31")
trend = detect_warming_trend(nyc_data)
# NOTE(review): assumes the trend dict was returned (not the
# {"error": ...} form) -- holds here since the range spans decades.
print(f"NYC temperature trend: {trend['trend_per_decade_c']}C per decade ({trend['direction']})")
print(f"Period: {trend['years_of_data']} years")
print(f"Start avg: {trend['start_mean']}C -> End avg: {trend['end_mean']}C (change: {trend['total_change_c']}C)")

Agricultural Weather Analysis

Growing degree days and frost dates are critical for agricultural planning:

def agricultural_season_summary(lat: float, lon: float, year: int) -> dict:
    """Generate agricultural weather summary for a growing season."""
    df = add_derived_metrics(
        get_historical_weather(lat, lon, f"{year}-01-01", f"{year}-12-31")
    )

    freezing = df["temp_min_c"] < 0
    month = df["date"].dt.month

    # Last freezing day in Jan-Jun and first freezing day in Aug-Dec
    spring = df[(month < 7) & freezing]
    fall = df[(month > 7) & freezing]
    last_spring_frost = spring["date"].max() if not spring.empty else None
    first_fall_frost = fall["date"].min() if not fall.empty else None

    # Growing season length: days between the two frost boundaries
    if last_spring_frost is not None and first_fall_frost is not None:
        season_days = (first_fall_frost - last_spring_frost).days
    else:
        season_days = None

    return {
        "year": year,
        "last_spring_frost": str(last_spring_frost.date()) if last_spring_frost is not None else "None",
        "first_fall_frost": str(first_fall_frost.date()) if first_fall_frost is not None else "None",
        "growing_season_days": season_days,
        "total_gdd_base10": round(df["gdd"].sum(), 0),
        "total_hdd": round(df["hdd"].sum(), 0),
        "total_cdd": round(df["cdd"].sum(), 0),
        "annual_precip_mm": round(df["precipitation_mm"].sum(), 0),
        "frost_days": int(df["frost_day"].sum()),
    }


# Agricultural summary for Iowa farmland (corn belt)
# Fetches a full year from Open-Meteo -- network required.
iowa_summary = agricultural_season_summary(42.0, -93.6, 2025)
for k, v in iowa_summary.items():
    print(f"  {k}: {v}")

Comparing Actual vs Climate Normal

Use 30-year averages as a baseline to identify anomalous years:

def compare_year_to_normal(lat: float, lon: float, target_year: int, baseline_start: int = 1991, baseline_end: int = 2020) -> pd.DataFrame:
    """Compare a year's weather to the 30-year climate normal."""
    def fetch_year(yr: int) -> pd.DataFrame:
        # One whole-year request per call, tagged with its month number
        frame = get_historical_weather(lat, lon, f"{yr}-01-01", f"{yr}-12-31")
        frame["month"] = frame["date"].dt.month
        return frame

    target_df = fetch_year(target_year)

    # One request per baseline year, throttled to be polite to the API
    baseline_frames = []
    for yr in range(baseline_start, baseline_end + 1):
        frame = fetch_year(yr)
        frame["year"] = yr
        baseline_frames.append(frame)
        time.sleep(0.2)
    baseline = pd.concat(baseline_frames)

    # Monthly normals: mean temperature, average total precipitation
    n_years = baseline_end - baseline_start + 1
    monthly_normal = (
        baseline.groupby("month")
        .agg({"temp_mean_c": "mean", "precipitation_mm": "sum"})
        .rename(columns={"temp_mean_c": "normal_temp", "precipitation_mm": "normal_precip"})
    )
    monthly_normal["normal_precip"] = monthly_normal["normal_precip"] / n_years

    monthly_target = (
        target_df.groupby("month")
        .agg({"temp_mean_c": "mean", "precipitation_mm": "sum"})
        .rename(columns={"temp_mean_c": "actual_temp", "precipitation_mm": "actual_precip"})
    )

    # Join on month index and express the differences as anomalies
    result = monthly_normal.join(monthly_target)
    result["temp_anomaly"] = (result["actual_temp"] - result["normal_temp"]).round(2)
    result["precip_anomaly_pct"] = ((result["actual_precip"] - result["normal_precip"]) / result["normal_precip"] * 100).round(1)

    return result

Rainfall Pattern Analysis

def analyze_rainfall_patterns(df: pd.DataFrame) -> dict:
    """Analyze rainfall patterns: wet spells, dry spells, intensity.

    A day counts as "rainy" above 1.0 mm of precipitation. Averages
    and maxima are reported as 0 (rather than NaN) when there are no
    qualifying days.
    """
    rainy_days = df[df["precipitation_mm"] > 1.0]
    dry_days = df[df["precipitation_mm"] <= 1.0]

    # Label consecutive runs of dry days so spell lengths can be measured
    df_copy = df.copy()
    df_copy["is_dry"] = df_copy["precipitation_mm"] <= 1.0
    df_copy["spell_id"] = (df_copy["is_dry"] != df_copy["is_dry"].shift()).cumsum()
    dry_spells = df_copy[df_copy["is_dry"]].groupby("spell_id").size()

    return {
        "rainy_days": len(rainy_days),
        "dry_days": len(dry_days),
        "rainy_day_pct": round(len(rainy_days) / len(df) * 100, 1) if len(df) else 0.0,
        # Guard the empty cases: .mean()/.max() of an empty series is NaN
        "avg_rain_on_rainy_days": round(rainy_days["precipitation_mm"].mean(), 1) if not rainy_days.empty else 0.0,
        "max_daily_rain": round(df["precipitation_mm"].max(), 1) if len(df) else 0.0,
        "longest_dry_spell": int(dry_spells.max()) if not dry_spells.empty else 0,
        "avg_dry_spell": round(dry_spells.mean(), 1) if not dry_spells.empty else 0,
        "heavy_rain_days_25mm": int((df["precipitation_mm"] > 25).sum()),
        "extreme_rain_days_50mm": int((df["precipitation_mm"] > 50).sum()),
    }