How to Scrape UK Companies House: Company Data, Directors & Financials in Python (2026)
Companies House is the UK's register of companies. Every limited company, LLP, and many other business entities in the UK must file here. That means company details, director appointments, significant shareholdings (PSC data), annual accounts, and confirmation statements — all publicly accessible.
The best part: Companies House provides a genuinely good REST API. Free. 600 requests per 5 minutes. No rate limit games, no CAPTCHA pages, just clean JSON. You'll only need to scrape the website directly for a few edge cases where the API doesn't cover the data you want.
Why Companies House Data Is Valuable
The UK company register covers over 5 million active companies, with another 3+ million dissolved. Filing requirements mean the data has genuine depth:
- Company formation and dissolution dates — track industry cohort trends
- Director networks — see who sits on which boards, track serial entrepreneurs
- PSC (Persons with Significant Control) — ownership structures, 25%+ shareholders
- Annual accounts — revenue, profit, assets, liabilities (where filed in XBRL format)
- Confirmation statements — annual snapshot of registered addresses, officer details
- Charge registers — security interests over company assets (mortgages, debentures)
- SIC codes — industry classification, useful for sector screening
Combined with free-tier access and clean JSON responses, Companies House is arguably the most developer-friendly open government dataset in the UK.
Getting an API Key
You need a free account at the Companies House developer hub:
- Go to https://developer.company-information.service.gov.uk/ and register for an account
- Create an application — choose "REST API"
- You'll get an API key used as HTTP Basic Auth username with an empty password
import httpx
import base64
import time
import json
import sqlite3
from typing import Optional, Generator
from dataclasses import dataclass, field
# Companies House API key — sent as the HTTP Basic Auth username with an
# empty password (see ch_client). Replace the placeholder before running.
API_KEY = "your_api_key_here"
# Root URL of the Companies House public data REST API.
BASE_URL = "https://api.company-information.service.gov.uk"
def ch_client(api_key: str = API_KEY) -> httpx.Client:
    """Build an httpx client pre-configured for the Companies House API.

    The key is transmitted as the HTTP Basic Auth username with an empty
    password, which is the scheme Companies House expects.
    """
    token = base64.b64encode(f"{api_key}:".encode()).decode()
    default_headers = {
        "Authorization": f"Basic {token}",
        "Accept": "application/json",
        "User-Agent": "CompaniesHouseDataBot/1.0 ([email protected])",
    }
    return httpx.Client(base_url=BASE_URL, headers=default_headers, timeout=20)
def ch_get(client: httpx.Client, path: str, params: Optional[dict] = None, retries: int = 3) -> dict:
    """
    Make a Companies House API request with retry and rate limit handling.

    Per attempt, three transient conditions are handled:
      * 429 — sleep for the Retry-After header value (default 30s), retry.
      * 502/503 — linear backoff (5s, 10s, ...), retry.
      * timeout — retried; re-raised once `retries` attempts are exhausted.

    Returns the parsed JSON body, {} for a 404 (resource not found), or {}
    when all retries are consumed without a successful response. Other 4xx/5xx
    statuses raise httpx.HTTPStatusError via raise_for_status().
    """
    for attempt in range(retries):
        try:
            resp = client.get(path, params=params)
            if resp.status_code == 429:
                # Rate limit: 600 requests per 5 minutes.
                # Retry-After may be missing or malformed; fall back to 30s
                # instead of crashing on int().
                try:
                    retry_after = int(resp.headers.get("Retry-After", 30))
                except ValueError:
                    retry_after = 30
                print(f"Rate limited. Waiting {retry_after}s...")
                time.sleep(retry_after)
                continue
            if resp.status_code == 404:
                return {}  # Company not found
            if resp.status_code in (502, 503):
                wait = 5 * (attempt + 1)
                print(f"Server error {resp.status_code}. Waiting {wait}s...")
                time.sleep(wait)
                continue
            resp.raise_for_status()
            return resp.json()
        except httpx.TimeoutException:
            if attempt == retries - 1:
                raise
            time.sleep(3)
    return {}
Searching for Companies
The search endpoint supports flexible text queries with filter options:
def search_companies(
    query: str,
    client: httpx.Client,
    items_per_page: int = 20,
    start_index: int = 0,
    restrictions: Optional[str] = None,
) -> dict:
    """
    Search the company register by name or number.

    restrictions narrows the result set, e.g. 'active-companies' or
    'open-limited-partnerships'. Page size is capped at the API maximum
    of 100.
    """
    query_params = {
        "q": query,
        "items_per_page": min(items_per_page, 100),
        "start_index": start_index,
    }
    if restrictions:
        query_params["restrictions"] = restrictions
    return ch_get(client, "/search/companies", params=query_params)
def search_all_companies(
    query: str,
    client: httpx.Client,
    max_results: int = 500,
) -> list[dict]:
    """Collect company search results across pages, up to max_results."""
    page_size = 100
    collected: list[dict] = []
    offset = 0
    while len(collected) < max_results:
        page = search_companies(
            query, client, items_per_page=page_size, start_index=offset
        )
        page_items = page.get("items", [])
        if not page_items:
            break
        collected.extend(page_items)
        reported_total = page.get("total_results", 0)
        # Stop once the next page would start past either the reported
        # total or the caller's cap.
        if offset + page_size >= min(reported_total, max_results):
            break
        offset += page_size
        time.sleep(0.7)  # Stay within 600/5min = ~2/s limit
    return collected[:max_results]
def search_officers(query: str, client: httpx.Client, limit: int = 20) -> list[dict]:
    """Search the officer index (directors, secretaries) by name."""
    payload = ch_get(
        client,
        "/search/officers",
        params={"q": query, "items_per_page": limit},
    )
    return payload.get("items", [])
Getting Full Company Details
def get_company(company_number: str, client: httpx.Client) -> dict:
    """Fetch the full company profile: registered office, SIC codes, key dates."""
    raw = ch_get(client, f"/company/{company_number}")
    if not raw:
        return {}
    # (output key, profile key, default) triples for the flattened view.
    field_map = [
        ("company_name", "company_name", ""),
        ("company_status", "company_status", ""),
        ("company_type", "type", ""),
        ("date_of_creation", "date_of_creation", ""),
        ("date_of_cessation", "date_of_cessation", ""),
        ("registered_office", "registered_office_address", {}),
        ("sic_codes", "sic_codes", []),
        ("accounts", "accounts", {}),
        ("confirmation_statement", "confirmation_statement", {}),
        ("has_charges", "has_charges", False),
        ("has_insolvency_history", "has_insolvency_history", False),
        ("is_community_interest_company", "is_community_interest_company", False),
        ("jurisdiction", "jurisdiction", ""),
        ("registered_office_is_in_dispute", "registered_office_is_in_dispute", False),
        ("links", "links", {}),
    ]
    profile = {"company_number": company_number}
    for out_key, src_key, default in field_map:
        profile[out_key] = raw.get(src_key, default)
    return profile
def get_officers(
    company_number: str,
    client: httpx.Client,
    active_only: bool = False,
    officer_type: Optional[str] = None,
) -> list[dict]:
    """
    Fetch company officers (directors, secretaries, LLP members).

    officer_type filters server-side: 'director', 'secretary',
    'llp-member', etc. active_only drops anyone with a resignation date.
    """
    query = {"items_per_page": 100, "register_view": "false"}
    if officer_type:
        query["officer_type"] = officer_type
    payload = ch_get(client, f"/company/{company_number}/officers", params=query)

    def _shape(entry: dict) -> dict:
        # Only month/year of birth are published by Companies House.
        dob = entry.get("date_of_birth", {})
        return {
            "name": entry.get("name", ""),
            "role": entry.get("officer_role", ""),
            "appointed_on": entry.get("appointed_on", ""),
            "resigned_on": entry.get("resigned_on"),
            "nationality": entry.get("nationality", ""),
            "occupation": entry.get("occupation", ""),
            "country_of_residence": entry.get("country_of_residence", ""),
            "date_of_birth": {"month": dob.get("month"), "year": dob.get("year")},
            "address": entry.get("address", {}),
            "identification": entry.get("identification", {}),
        }

    return [
        _shape(entry)
        for entry in payload.get("items", [])
        if not (active_only and entry.get("resigned_on"))
    ]
def get_psc(company_number: str, client: httpx.Client) -> list[dict]:
    """
    Fetch Persons with Significant Control for a company.

    Covers individuals plus legal and corporate entities holding >25%.
    """
    payload = ch_get(client, f"/company/{company_number}/persons-with-significant-control")
    records = []
    for entry in payload.get("items", []):
        record = {
            "name": entry.get("name", ""),
            "kind": entry.get("kind", ""),
            "natures_of_control": entry.get("natures_of_control", []),
            "notified_on": entry.get("notified_on", ""),
            "ceased_on": entry.get("ceased_on"),
            "nationality": entry.get("nationality", ""),
            "country_of_residence": entry.get("country_of_residence", ""),
            "address": entry.get("address", {}),
        }
        # Corporate PSCs carry an identification sub-object.
        if "identification" in entry:
            record["identification"] = entry["identification"]
        # Date of birth: only month/year — the full date is never exposed.
        if "date_of_birth" in entry:
            dob = entry["date_of_birth"]
            record["date_of_birth"] = {"month": dob.get("month"), "year": dob.get("year")}
        records.append(record)
    return records
def get_psc_corporate_entities(company_number: str, client: httpx.Client) -> list[dict]:
    """Return only the corporate/legal-entity PSCs for a company."""
    corporate_kinds = (
        "corporate-entity-person-with-significant-control",
        "legal-person-person-with-significant-control",
    )
    payload = ch_get(
        client,
        f"/company/{company_number}/persons-with-significant-control",
        params={"register_view": "false"},
    )
    return [
        entry
        for entry in payload.get("items", [])
        if entry.get("kind") in corporate_kinds
    ]
Filing History and Document Access
def get_filing_history(
    company_number: str,
    client: httpx.Client,
    category: Optional[str] = None,
    items_per_page: int = 25,
) -> list[dict]:
    """
    Fetch a company's filing history.

    category restricts results: 'accounts', 'annual-return',
    'confirmation-statement', 'incorporation', 'officers', 'mortgage', etc.
    Page size is capped at the API maximum of 100.
    """
    query = {"items_per_page": min(items_per_page, 100)}
    if category:
        query["category"] = category
    payload = ch_get(client, f"/company/{company_number}/filing-history", params=query)
    return [
        {
            "date": entry.get("date", ""),
            "type": entry.get("type", ""),
            "description": entry.get("description", ""),
            "description_values": entry.get("description_values", {}),
            "category": entry.get("category", ""),
            "subcategory": entry.get("subcategory", ""),
            "action_date": entry.get("action_date", ""),
            "transaction_id": entry.get("transaction_id", ""),
            "document_url": entry.get("links", {}).get("document_metadata", ""),
            "paper_filed": entry.get("paper_filed", False),
        }
        for entry in payload.get("items", [])
    ]
def download_filing_document(transaction_id: str, client: httpx.Client) -> Optional[bytes]:
    """
    Download a filing document via the Companies House document API.

    Fetches the document metadata first; its 'resources' object maps
    available content types to metadata (content_length etc.) — it does
    NOT contain URLs. The downloadable content URL lives under
    links.document and must be requested with an explicit PDF Accept
    header (the API then redirects to the file store).

    Returns PDF bytes, or None if no PDF rendition exists or the
    download fails.

    NOTE(review): the document API is keyed by document id; filing
    history entries expose it via their 'document_metadata' link.
    Confirm the transaction id passed here resolves as a document id.
    """
    doc_client = httpx.Client(
        base_url="https://document-api.company-information.service.gov.uk",
        headers=client.headers,  # reuse the API client's Basic Auth header
        timeout=30,
    )
    try:
        meta = ch_get(doc_client, f"/document/{transaction_id}")
        resources = meta.get("resources", {})
        # Prefer PDF when a PDF rendition is listed.
        if "application/pdf" in resources:
            # BUG FIX: the original passed resources["application/pdf"]
            # (a metadata dict, not a URL) straight to .get().
            content_url = meta.get("links", {}).get(
                "document", f"/document/{transaction_id}/content"
            )
            resp = doc_client.get(
                content_url,
                headers={"Accept": "application/pdf"},
                follow_redirects=True,
            )
            if resp.status_code == 200:
                return resp.content
    except Exception as e:
        print(f"Document download failed: {e}")
    finally:
        doc_client.close()
    return None
def get_charges(company_number: str, client: httpx.Client) -> list[dict]:
    """Fetch charges (security interests, e.g. mortgages) registered against a company."""
    payload = ch_get(client, f"/company/{company_number}/charges")
    return [
        {
            "charge_number": entry.get("charge_number", ""),
            "status": entry.get("status", ""),
            "classification": entry.get("classification", {}),
            "created_on": entry.get("created_on", ""),
            "satisfied_on": entry.get("satisfied_on"),
            "delivered_on": entry.get("delivered_on", ""),
            "persons_entitled": entry.get("persons_entitled", []),
            "particulars": entry.get("particulars", {}),
            "secured_details": entry.get("secured_details", {}),
        }
        for entry in payload.get("items", [])
    ]
def get_insolvency(company_number: str, client: httpx.Client) -> dict:
    """Fetch the insolvency case history for the given company number."""
    endpoint = f"/company/{company_number}/insolvency"
    return ch_get(client, endpoint)
Bulk Data Collection
When you need data on thousands of companies — for market research, due diligence pipelines, or financial analysis — the API rate limit (600 requests per 5 minutes) becomes the bottleneck. That's 2 requests per second sustained. With proper pacing you can collect data on roughly 5,000-7,000 companies per hour via the API.
def bulk_company_export(
    company_numbers: list[str],
    output_file: str,
    client: httpx.Client,
    include_officers: bool = True,
    include_pscs: bool = True,
    include_filings: bool = False,
    delay: float = 0.55,
) -> dict:
    """
    Export company data in bulk to CSV, respecting API rate limits.

    delay is the pause (seconds) inserted after every API request —
    including the extra officer/PSC lookups, which previously slept a
    hard-coded 0.55s regardless of this parameter — to stay under the
    600-requests-per-5-minutes limit (~2 req/s).

    Returns summary stats: {'success', 'not_found', 'errors', 'total'}.

    NOTE(review): include_filings is accepted but not acted on anywhere
    in this function — confirm whether filing export was ever wired up.
    """
    import csv
    from pathlib import Path

    fieldnames = [
        "company_number", "company_name", "status", "type",
        "date_of_creation", "date_of_cessation", "sic_codes",
        "registered_office_address_line_1", "registered_office_postcode",
        "registered_office_country", "jurisdiction",
        "has_charges", "has_insolvency_history",
        "accounts_next_due", "accounts_last_made_up_to",
        "confirmation_statement_next_due",
        "active_directors", "total_pscs",
    ]
    stats = {"success": 0, "not_found": 0, "errors": 0, "total": len(company_numbers)}
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
        writer.writeheader()
        for i, num in enumerate(company_numbers):
            try:
                company = get_company(num, client)
                if not company:
                    stats["not_found"] += 1
                    # BUG FIX: the original `continue` skipped the pacing
                    # sleep — a run of 404s could burst past the rate limit.
                    time.sleep(delay)
                    continue
                row = {
                    "company_number": num,
                    "company_name": company.get("company_name", ""),
                    "status": company.get("company_status", ""),
                    "type": company.get("company_type", ""),
                    "date_of_creation": company.get("date_of_creation", ""),
                    "date_of_cessation": company.get("date_of_cessation", ""),
                    "sic_codes": "|".join(company.get("sic_codes", [])),
                    "registered_office_address_line_1": company.get("registered_office", {}).get("address_line_1", ""),
                    "registered_office_postcode": company.get("registered_office", {}).get("postal_code", ""),
                    "registered_office_country": company.get("registered_office", {}).get("country", ""),
                    "jurisdiction": company.get("jurisdiction", ""),
                    "has_charges": company.get("has_charges", False),
                    "has_insolvency_history": company.get("has_insolvency_history", False),
                    "accounts_next_due": company.get("accounts", {}).get("next_due", ""),
                    "accounts_last_made_up_to": company.get("accounts", {}).get("last_accounts", {}).get("made_up_to", ""),
                    "confirmation_statement_next_due": company.get("confirmation_statement", {}).get("next_due", ""),
                }
                if include_officers:
                    officers = get_officers(num, client, active_only=True)
                    directors = [o for o in officers if o.get("role") == "director"]
                    row["active_directors"] = len(directors)
                    time.sleep(delay)  # was hard-coded 0.55; honor the parameter
                if include_pscs:
                    pscs = get_psc(num, client)
                    row["total_pscs"] = len(pscs)
                    time.sleep(delay)  # was hard-coded 0.55; honor the parameter
                writer.writerow(row)
                stats["success"] += 1
                if (i + 1) % 100 == 0:
                    print(f"Progress: {i + 1}/{len(company_numbers)} "
                          f"({stats['success']} ok, {stats['not_found']} not found, {stats['errors']} errors)")
            except Exception as e:
                print(f"Error on {num}: {e}")
                stats["errors"] += 1
            time.sleep(delay)
    return stats
Streaming the Free Bulk Data Download
For datasets covering tens of thousands of companies, Companies House provides monthly snapshots of the full register — all ~5 million companies in CSV format. Download from https://download.companieshouse.gov.uk/en_output.html.
import csv
import zipfile
import io
def process_bulk_snapshot(
    zip_path: str,
    output_db: str = "companies_house_bulk.db",
    filter_fn=None,
    sic_filter: Optional[list[str]] = None,
) -> int:
    """
    Load the Companies House bulk register snapshot ZIP into SQLite.

    The ZIP contains one or more CSV parts of the full register.
    filter_fn: optional callable(row_dict) -> bool; rows returning False
               are skipped.
    sic_filter: list of SIC code prefixes; a row is kept when any of its
                four SIC text fields (text before " - ") starts with any
                prefix.
    Returns the number of rows sent to the database (duplicates are
    silently ignored via INSERT OR IGNORE but still counted, matching the
    original batching behavior).
    """
    columns = [
        "CompanyNumber", "CompanyName", "CompanyCategory", "CompanyStatus",
        "CountryOfOrigin", "DissolutionDate", "IncorporationDate",
        "SICCode.SicText_1", "SICCode.SicText_2", "SICCode.SicText_3", "SICCode.SicText_4",
        "RegAddress.PostCode", "RegAddress.Country",
        "Accounts.NextDueDate", "Accounts.LastMadeUpDate",
    ]
    col_names = ", ".join(f'"{c}"' for c in columns)
    placeholders = ", ".join(["?"] * len(columns))
    insert_sql = f"INSERT OR IGNORE INTO companies ({col_names}) VALUES ({placeholders})"

    def _flush(conn: sqlite3.Connection, batch: list[dict]) -> None:
        # Insert a batch of CSV rows; columns missing from a row become "".
        conn.executemany(insert_sql, [[row.get(c, "") for c in columns] for row in batch])
        conn.commit()

    conn = sqlite3.connect(output_db)
    total = 0
    # BUG FIX: the original never closed the connection on error paths
    # (bad ZIP, malformed CSV) — ensure cleanup with try/finally.
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        conn.execute("""
            CREATE TABLE IF NOT EXISTS companies (
                CompanyNumber TEXT PRIMARY KEY,
                CompanyName TEXT,
                CompanyCategory TEXT,
                CompanyStatus TEXT,
                CountryOfOrigin TEXT,
                DissolutionDate TEXT,
                IncorporationDate TEXT,
                "SICCode.SicText_1" TEXT,
                "SICCode.SicText_2" TEXT,
                "SICCode.SicText_3" TEXT,
                "SICCode.SicText_4" TEXT,
                "RegAddress.PostCode" TEXT,
                "RegAddress.Country" TEXT,
                "Accounts.NextDueDate" TEXT,
                "Accounts.LastMadeUpDate" TEXT,
                imported_at TEXT DEFAULT (datetime('now'))
            )
        """)
        conn.commit()
        with zipfile.ZipFile(zip_path) as zf:
            for name in zf.namelist():
                if not name.endswith(".csv"):
                    continue
                print(f"Processing {name}...")
                with zf.open(name) as csvfile:
                    reader = csv.DictReader(io.TextIOWrapper(csvfile, encoding="utf-8-sig"))
                    batch = []
                    for row in reader:
                        # Apply SIC prefix filter if specified
                        if sic_filter:
                            row_sics = [
                                row.get(f"SICCode.SicText_{i}", "").split(" - ")[0].strip()
                                for i in range(1, 5)
                            ]
                            if not any(
                                any(sic.startswith(prefix) for prefix in sic_filter)
                                for sic in row_sics if sic
                            ):
                                continue
                        if filter_fn and not filter_fn(row):
                            continue
                        batch.append(row)
                        if len(batch) >= 1000:
                            _flush(conn, batch)
                            total += len(batch)
                            batch = []
                    if batch:
                        _flush(conn, batch)
                        total += len(batch)
    finally:
        conn.close()
    print(f"Imported {total:,} companies to {output_db}")
    return total
def _insert_batch(conn: sqlite3.Connection, rows: list[dict]):
    """Insert a batch of CSV rows into the companies table, ignoring duplicates."""
    cols = [
        "CompanyNumber", "CompanyName", "CompanyCategory", "CompanyStatus",
        "CountryOfOrigin", "DissolutionDate", "IncorporationDate",
        "SICCode.SicText_1", "SICCode.SicText_2", "SICCode.SicText_3", "SICCode.SicText_4",
        "RegAddress.PostCode", "RegAddress.Country",
        "Accounts.NextDueDate", "Accounts.LastMadeUpDate",
    ]
    # Column names are double-quoted because several contain dots.
    quoted = ", ".join(f'"{c}"' for c in cols)
    slots = ", ".join("?" for _ in cols)
    values = [[row.get(c, "") for c in cols] for row in rows]
    conn.executemany(
        f"INSERT OR IGNORE INTO companies ({quoted}) VALUES ({slots})",
        values,
    )
    conn.commit()
Web Scraping for Data Not in the API
A few data points aren't available through the API. The main one is parsed financial figures from accounts.
from bs4 import BeautifulSoup
def scrape_company_website_page(
    company_number: str,
    proxy: Optional[str] = None,
) -> dict:
    """
    Pull data from the Companies House website that the API omits —
    previous company names and certain account display details.
    """
    url = f"https://find-and-update.company-information.service.gov.uk/company/{company_number}"
    transport = httpx.HTTPTransport(proxy=proxy) if proxy else None
    client = httpx.Client(
        transport=transport,
        headers={
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                          "AppleWebKit/537.36 Chrome/126.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml",
        },
        timeout=15,
        follow_redirects=True,
    )
    try:
        resp = client.get(url)
        resp.raise_for_status()
    except Exception as e:
        return {"error": str(e)}
    finally:
        client.close()

    soup = BeautifulSoup(resp.text, "lxml")
    result = {}

    # Previous company names table (not exposed by the API).
    previous = []
    names_section = soup.find("section", id="previous-company-names")
    if names_section:
        for table_row in names_section.select("tbody tr"):
            tds = table_row.select("td")
            if len(tds) >= 2:
                previous.append({
                    "name": tds[0].get_text(strip=True),
                    "dates": tds[1].get_text(strip=True),
                })
    result["previous_names"] = previous

    # Displayed company status text.
    status_el = soup.find("dd", {"id": "company-status"})
    if status_el:
        result["status_display"] = status_el.get_text(strip=True)

    # Registered email section presence (rarely surfaced via the API).
    email_section = soup.find("section", {"id": "registered-email-address"})
    if email_section:
        result["has_registered_email"] = True
    return result
def parse_xbrl_accounts(xbrl_content: bytes) -> dict:
    """
    Parse iXBRL/XBRL accounts and extract headline financial figures.

    Accounts filed electronically with Companies House use the FRC XBRL
    taxonomies. Uses lxml when available and falls back to the standard
    library's ElementTree (both expose compatible fromstring/findall).

    Returns a dict containing any of 'turnover' and 'operating_profit',
    or {'error': ...} when the document cannot be parsed.
    """
    try:
        from lxml import etree
    except ImportError:
        import xml.etree.ElementTree as etree

    try:
        root = etree.fromstring(xbrl_content)
    except Exception as e:
        return {"error": f"Parse failed: {e}"}

    # Common XBRL namespaces used in UK accounts
    namespaces = {
        "core": "http://xbrl.frc.org.uk/fr/2014-09-01/core",
        "uk-gaap": "http://xbrl.frc.org.uk/gaap/pt/2014-09-01",
        "bus": "http://xbrl.frc.org.uk/cd/2014-09-01/business",
    }

    def _first_number(qualified_tags: list[str]) -> Optional[float]:
        """Try tags in order; return the first match parsed as a float."""
        for tag in qualified_tags:
            prefix, local = tag.split(":")
            # BUG FIX: the original searched for the literal 'Turnover'
            # regardless of which tag was being tried, so alternatives
            # like core:Revenue were never actually looked up.
            matches = root.findall(f".//{{{namespaces.get(prefix, '')}}}{local}")
            if matches:
                try:
                    return float(matches[0].text.replace(",", ""))
                except (ValueError, TypeError, AttributeError):
                    # Unparseable or empty value — mirror the original's
                    # "break on first hit" behavior by giving up here.
                    return None
        return None

    figures = {}
    turnover = _first_number(["core:Turnover", "uk-gaap:Turnover", "core:Revenue"])
    if turnover is not None:
        figures["turnover"] = turnover
    operating_profit = _first_number(["core:OperatingProfitLoss"])
    if operating_profit is not None:
        figures["operating_profit"] = operating_profit
    return figures
Anti-Bot Considerations for Website Scraping
The Companies House API doesn't need proxies — the rate limit is per API key, not per IP. But the website (find-and-update.company-information.service.gov.uk) applies IP-based rate limiting. Hitting the website at API speeds (2 req/s) from a single IP will get you temporarily blocked.
For high-volume website scraping — parsing filing documents, extracting financial figures across a sector, downloading XBRL accounts — ThorData's residential proxies provide the IP rotation you need. Each request appears from a different residential address, keeping individual IPs well under the threshold.
THORDATA_PROXY = "http://USER:[email protected]:9001"
def create_web_scraping_client(use_proxy: bool = True) -> httpx.Client:
    """Build a browser-like httpx client, optionally routed through the proxy."""
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                      "AppleWebKit/537.36 Chrome/126.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9",
        "Accept-Language": "en-GB,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
    }
    if use_proxy:
        transport = httpx.HTTPTransport(proxy=THORDATA_PROXY)
    else:
        transport = None
    return httpx.Client(
        transport=transport,
        headers=browser_headers,
        timeout=20,
        follow_redirects=True,
    )
def scrape_accounts_for_sector(
    company_numbers: list[str],
    use_proxy: bool = True,
    delay: float = 2.0,
) -> list[dict]:
    """
    Download the most recent filed accounts document for each company.

    Looks up the latest 'accounts' filing via the API and downloads the
    document, pausing `delay` seconds between companies.

    Returns one record per successful download:
    {company_number, accounts_date, document_size, document_type}.

    NOTE(review): the web-scraping client is created (per use_proxy) but
    never used — all traffic goes through the API client. Confirm whether
    website scraping was meant to happen here before removing it.
    """
    client = create_web_scraping_client(use_proxy=use_proxy)
    api_client = ch_client()
    results = []
    # BUG FIX: ensure both clients are closed even if the loop is
    # interrupted (e.g. KeyboardInterrupt) — previously they leaked.
    try:
        for num in company_numbers:
            try:
                # Get filing history for accounts filings
                filings = get_filing_history(num, api_client, category="accounts", items_per_page=5)
                if not filings:
                    continue
                # Download most recent accounts
                latest = filings[0]
                tx_id = latest.get("transaction_id")
                if tx_id:
                    pdf_bytes = download_filing_document(tx_id, api_client)
                    if pdf_bytes:
                        results.append({
                            "company_number": num,
                            "accounts_date": latest.get("date"),
                            "document_size": len(pdf_bytes),
                            "document_type": latest.get("type"),
                        })
            except Exception as e:
                print(f"Error on {num}: {e}")
            finally:
                # BUG FIX: pace every iteration — the original `continue`
                # on empty filings skipped the sleep even though an API
                # request had just been made.
                time.sleep(delay)
    finally:
        api_client.close()
        client.close()
    return results
Cross-Referencing with Other UK Registries
Companies House data becomes more powerful when combined with other public sources:
def verify_vat_registration(vat_number: str) -> dict:
    """
    Verify a UK VAT registration via the HMRC check-vat-number API.
    No API key is required for the basic lookup.

    Returns {'valid': True, 'name', 'address', 'vat_number'} on a 200
    response, otherwise {'valid': False, 'vat_number'}.
    """
    # Normalize: drop whitespace first, then strip only a LEADING "GB".
    # BUG FIX: the original replace("GB", "") removed "GB" anywhere in
    # the string, not just as a country-code prefix.
    clean_vat = vat_number.upper().replace(" ", "").removeprefix("GB")
    resp = httpx.get(
        f"https://api.service.hmrc.gov.uk/organisations/vat/check-vat-number/lookup/{clean_vat}",
        headers={"Accept": "application/json"},
        timeout=15,
    )
    if resp.status_code == 200:
        data = resp.json()
        target = data.get("target", {})
        return {
            "valid": True,
            "name": target.get("name", ""),
            "address": target.get("address", {}),
            "vat_number": clean_vat,
        }
    return {"valid": False, "vat_number": clean_vat}
def build_company_intelligence_report(
    company_number: str,
    client: httpx.Client,
) -> dict:
    """Assemble a single company overview by combining several API lookups."""
    profile = get_company(company_number, client)
    if not profile:
        return {"error": f"Company {company_number} not found"}

    # Pace the follow-up calls to stay inside the rate limit.
    time.sleep(0.6)
    all_officers = get_officers(company_number, client)
    time.sleep(0.6)
    controllers = get_psc(company_number, client)
    time.sleep(0.6)
    filings = get_filing_history(company_number, client, items_per_page=10)
    time.sleep(0.6)
    company_charges = get_charges(company_number, client)

    def _active_with_role(role: str) -> list[dict]:
        # Officers with no resignation date and the requested role.
        return [o for o in all_officers if not o["resigned_on"] and o["role"] == role]

    directors = _active_with_role("director")
    secretaries = _active_with_role("secretary")
    resigned_count = sum(1 for o in all_officers if o["resigned_on"])

    accounts_info = profile.get("accounts", {})
    return {
        "company_number": company_number,
        "company_name": profile.get("company_name"),
        "status": profile.get("company_status"),
        "type": profile.get("company_type"),
        "incorporated": profile.get("date_of_creation"),
        "dissolved": profile.get("date_of_cessation"),
        "registered_office": profile.get("registered_office"),
        "sic_codes": profile.get("sic_codes", []),
        "jurisdiction": profile.get("jurisdiction"),
        "has_charges": profile.get("has_charges"),
        "has_insolvency": profile.get("has_insolvency_history"),
        "accounts_next_due": accounts_info.get("next_due"),
        "accounts_overdue": accounts_info.get("overdue", False),
        "confirmation_next_due": profile.get("confirmation_statement", {}).get("next_due"),
        "active_directors": directors,
        "active_director_count": len(directors),
        "active_secretaries": secretaries,
        "resigned_officer_count": resigned_count,
        "significant_controllers": controllers,
        "has_corporate_psc": any(
            "corporate-entity" in p.get("kind", "") for p in controllers
        ),
        "charges": company_charges,
        "outstanding_charge_count": sum(
            1 for c in company_charges if c["status"] == "outstanding"
        ),
        "recent_filings": filings[:5],
        "last_filing_date": filings[0]["date"] if filings else None,
    }
Production Database Schema
For large-scale Companies House data pipelines:
def setup_companies_house_db(db_path: str = "ch_data.db") -> sqlite3.Connection:
    """Create (if needed) and return the SQLite schema for a CH data pipeline.

    Four tables: companies (profile snapshot), officers, pscs, filings,
    plus the indexes used by typical lookups. WAL journaling and NORMAL
    synchronous are set for write-heavy ingestion.
    """
    db = sqlite3.connect(db_path)
    db.execute("PRAGMA journal_mode=WAL")
    db.execute("PRAGMA synchronous=NORMAL")
    schema = """
        CREATE TABLE IF NOT EXISTS companies (
            company_number TEXT PRIMARY KEY,
            company_name TEXT,
            status TEXT,
            type TEXT,
            date_of_creation TEXT,
            date_of_cessation TEXT,
            sic_codes TEXT,
            postcode TEXT,
            country TEXT,
            jurisdiction TEXT,
            has_charges INTEGER DEFAULT 0,
            has_insolvency INTEGER DEFAULT 0,
            accounts_next_due TEXT,
            accounts_overdue INTEGER DEFAULT 0,
            confirmation_next_due TEXT,
            raw_json TEXT,
            scraped_at TEXT DEFAULT (datetime('now')),
            updated_at TEXT
        );
        CREATE TABLE IF NOT EXISTS officers (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            company_number TEXT NOT NULL,
            name TEXT,
            role TEXT,
            appointed_on TEXT,
            resigned_on TEXT,
            nationality TEXT,
            occupation TEXT,
            country_of_residence TEXT,
            dob_month INTEGER,
            dob_year INTEGER,
            FOREIGN KEY (company_number) REFERENCES companies(company_number),
            UNIQUE (company_number, name, role, appointed_on)
        );
        CREATE TABLE IF NOT EXISTS pscs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            company_number TEXT NOT NULL,
            name TEXT,
            kind TEXT,
            natures_of_control TEXT,
            notified_on TEXT,
            ceased_on TEXT,
            nationality TEXT,
            country_of_residence TEXT,
            FOREIGN KEY (company_number) REFERENCES companies(company_number)
        );
        CREATE TABLE IF NOT EXISTS filings (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            company_number TEXT NOT NULL,
            transaction_id TEXT UNIQUE,
            date TEXT,
            type TEXT,
            description TEXT,
            category TEXT,
            FOREIGN KEY (company_number) REFERENCES companies(company_number)
        );
        CREATE INDEX IF NOT EXISTS idx_officers_company ON officers(company_number);
        CREATE INDEX IF NOT EXISTS idx_officers_name ON officers(name);
        CREATE INDEX IF NOT EXISTS idx_pscs_company ON pscs(company_number);
        CREATE INDEX IF NOT EXISTS idx_companies_status ON companies(status);
        CREATE INDEX IF NOT EXISTS idx_companies_sic ON companies(sic_codes);
        CREATE INDEX IF NOT EXISTS idx_companies_postcode ON companies(postcode);
        CREATE INDEX IF NOT EXISTS idx_filings_company ON filings(company_number);
    """
    db.executescript(schema)
    db.commit()
    return db
Legal Considerations
Companies House data is released under the Open Government Licence (OGL), which is highly permissive:
- You can: Use the data commercially, redistribute it, adapt it, create derived datasets, publish analyses
- You must: Acknowledge the source ("Contains public sector information licensed under the Open Government Licence")
- PSC and officer data: This is published by Companies House as a statutory requirement. However, be aware that using officer/director information for certain purposes (direct marketing to individuals using their registered address, for example) may engage GDPR considerations.
- Web scraping the website: The Companies House website's terms don't explicitly prohibit scraping, but automated access that degrades the service for others would likely breach their acceptable use policy. Keep rates sensible and prefer the API where it covers your needs.
The API is the right tool for most work. The bulk download is for large-scale snapshot analysis. Reserve web scraping for the edges where the API falls short.