Extracting Names of People, Cities, and Countries Using NLP

Extracting named entities — people, cities, countries, organizations — from text is one of the most common practical NLP tasks. It powers journalism tools, contact enrichment, geography tagging, and compliance systems.

Extracting Names with spaCy NER

spaCy’s NER pipeline identifies people (PERSON), geopolitical entities (GPE), locations (LOC), and organizations (ORG):

import spacy

nlp = spacy.load("en_core_web_sm")

def extract_names(text):
    """Group the named entities spaCy finds in ``text`` by category.

    Args:
        text: Raw text to run through the module-level ``nlp`` pipeline.

    Returns:
        A dict with the keys ``"people"``, ``"cities_countries"``,
        ``"locations"`` and ``"organizations"``. Each value is a list of
        unique entity strings in order of first appearance.
    """
    # Entity label -> output bucket. Labels not listed here are ignored.
    bucket_for_label = {
        "PERSON": "people",
        "GPE": "cities_countries",  # countries, cities, states
        "LOC": "locations",         # mountains, rivers, landmarks
        "ORG": "organizations",
    }

    # Each bucket is a dict used as an insertion-ordered set, so duplicates
    # collapse while the first-seen order is kept.
    seen = {bucket: {} for bucket in bucket_for_label.values()}

    for entity in nlp(text).ents:
        bucket = bucket_for_label.get(entity.label_)
        if bucket is not None:
            seen[bucket][entity.text] = None

    return {bucket: list(names) for bucket, names in seen.items()}

# Sample passage mixing people, organizations, cities and a state.
text = """
Elon Musk's xAI, headquartered in Memphis, Tennessee, launched the Grok-2 model
in August 2024. Meta's CEO Mark Zuckerberg announced Llama 3 would be available
for researchers in Paris, London, and Tokyo. Meanwhile, Sundar Pichai confirmed
Google DeepMind's expanded operations in Zurich and New York.
"""

# Print one line per category, in the order extract_names() returns them.
result = extract_names(text)
for category in result:
    names = result[category]
    print(f"{category}: {names}")

Extracting Country Names with a Reference List

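For higher precision when you only need country names, validate spaCy's GPE entities against a reference list of known countries (built here from pycountry):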

import spacy
import pycountry

nlp = spacy.load("en_core_web_sm")

# Lower-cased lookup set holding, for every pycountry entry, its name and its
# two- and three-letter codes. A set keeps membership tests O(1).
COUNTRY_NAMES = {
    label.lower()
    for country in pycountry.countries
    for label in (country.name, country.alpha_2, country.alpha_3)
}

# Common everyday variants added on top of the pycountry-derived entries.
COUNTRY_NAMES |= {
    "usa", "us", "uk", "uae", "south korea", "north korea",
    "taiwan", "russia", "iran", "czech republic",
}

def extract_countries(text):
    """Return the country names mentioned in ``text``.

    GPE entities found by the module-level ``nlp`` pipeline are kept only
    if their lower-cased text appears in ``COUNTRY_NAMES``.

    Args:
        text: Raw text to analyse.

    Returns:
        A list of unique country strings in order of first appearance.
        A leading "the" is removed, so "the United Kingdom" is returned
        as "United Kingdom".
    """
    doc = nlp(text)

    validated = []
    for ent in doc.ents:
        if ent.label_ != "GPE":
            continue

        # Collapse internal whitespace so a name wrapped across lines
        # ("South\nKorea") still matches the reference list.
        name = " ".join(ent.text.split())

        # NER spans can include a leading article ("the United Kingdom"),
        # which the reference list does not contain.
        if name.lower().startswith("the "):
            name = name[4:]

        if name.lower() in COUNTRY_NAMES:
            validated.append(name)

    # Deduplicate while preserving order (set() would shuffle the result).
    return list(dict.fromkeys(validated))

# Sample passage listing several countries, including one preceded by "the"
# and one two-word name.
text = """
The AI summit was attended by delegates from France, Germany, Japan, Canada,
and the United Kingdom. Observers from Brazil, India, and South Korea participated
via video link.
"""

# Run the validated extractor and show what survived the reference-list check.
countries = extract_countries(text)
print("Countries found:", countries)

Person Name Extraction with Transformer NER

Transformer-based models achieve higher accuracy, especially for names in challenging contexts:

from transformers import pipeline

# Token-classification pipeline created once at import time and shared by
# every call below. aggregation_strategy="simple" makes the pipeline return
# grouped entities (with "entity_group" and "word" keys) instead of one
# prediction per token.
ner_pipeline = pipeline(
    task="ner",
    model="dslim/bert-base-NER",
    aggregation_strategy="simple",
)

def extract_people_and_places(text):
    """Extract people, locations and organizations with the transformer NER.

    Args:
        text: Raw text passed to the module-level ``ner_pipeline``.

    Returns:
        A dict with the keys ``"people"``, ``"locations"`` and
        ``"organizations"``. Each value is a list of unique entity strings
        in order of first appearance. Entities with any other group label
        are ignored.
    """
    # Entity group emitted by the model -> key in the returned dict.
    key_for_group = {
        "PER": "people",
        "LOC": "locations",
        "ORG": "organizations",
    }

    # Dicts serve as insertion-ordered sets so the output order is
    # deterministic (set() would return the names in arbitrary order).
    grouped = {key: {} for key in key_for_group.values()}

    for entity in ner_pipeline(text):
        key = key_for_group.get(entity["entity_group"])
        if key is not None:
            grouped[key][entity["word"]] = None

    return {key: list(words) for key, words in grouped.items()}

# Sentences with names that are harder to tag: long personal names,
# acronyms, and institutions named after places.
texts = [
    "Satya Nadella and Mustafa Suleyman presented at Microsoft Build in Seattle.",
    "The WHO director-general Tedros Adhanom Ghebreyesus spoke at the Geneva conference.",
    "Researchers at MIT Cambridge and Oxford collaborated on the NLP benchmark."
]

for text in texts:
    result = extract_people_and_places(text)
    print(f"Text: {text[:60]}...")
    # Only categories that actually contain entities are printed.
    non_empty = {key: values for key, values in result.items() if values}
    for key, values in non_empty.items():
        print(f"  {key}: {values}")
    print()

Handling Name Disambiguation

The same string can refer to different entities:

import spacy

nlp = spacy.load("en_core_web_sm")

# Pairs of sentences in which the same surface string ("Apple"/"apple",
# "Amazon") is used in different senses.
texts = [
    "Apple released a new chip architecture at their California headquarters.",
    "She picked up an apple from the farmers market in Vermont.",
    "Amazon is expanding its warehouse network in Poland and Romania.",
    "They kayaked down the Amazon through Brazil for three weeks."
]

# map() is lazy, so each sentence is still parsed right before it is printed.
for text, doc in zip(texts, map(nlp, texts)):
    for ent in doc.ents:
        print(f"[{ent.label_}] '{ent.text}' in: '{text[:55]}'")
    print()

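Context from surrounding words helps the neural NER model disambiguate “Apple” (ORG) from “apple” (food), and “Amazon” the company (ORG) from “the Amazon” the river (LOC). Small models such as en_core_web_sm do not always get these cases right, so check the printed labels rather than assuming them.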

Building a Complete Name Extraction Pipeline

import spacy
from collections import Counter

nlp = spacy.load("en_core_web_sm")

def extract_all_names(documents):
    """Extract people and place names from each document.

    Args:
        documents: Iterable of text strings. Each one is parsed with the
            module-level ``nlp`` pipeline.

    Returns:
        A list with one dict per document, in input order. Each dict has:

        * ``"text"``: the first 80 characters of the document.
        * ``"people"``: PERSON entities made of at least two words.
        * ``"cities"``: every GPE entity (cities, states and countries).
        * ``"countries"``: always empty; kept so the result shape is stable.
        * ``"places"``: LOC entities.

        The entity lists hold unique strings in order of first appearance.
    """
    results = []

    for doc_text in documents:
        doc = nlp(doc_text)

        entities = {
            "text": doc_text[:80],
            "people": [],
            "cities": [],
            "countries": [],
            "places": [],
        }

        for ent in doc.ents:
            if ent.label_ == "PERSON":
                # Single-token PERSON hits are skipped; only names of two
                # or more words are kept.
                if len(ent.text.split()) >= 2:
                    entities["people"].append(ent.text)
            elif ent.label_ == "GPE":
                # The GPE label does not separate cities from countries, so
                # every GPE goes into "cities". Splitting them would need a
                # reference list of country names.
                entities["cities"].append(ent.text)
            elif ent.label_ == "LOC":
                entities["places"].append(ent.text)

        # Deduplicate while preserving order (set() would shuffle the lists).
        for key in ("people", "cities", "countries", "places"):
            entities[key] = list(dict.fromkeys(entities[key]))

        results.append(entities)

    return results

# Three short news-style sentences used to exercise the pipeline.
news_articles = [
    "Google's CEO Sundar Pichai announced new AI investments in Singapore and Mumbai during his Asia tour.",
    "Researchers from MIT, Stanford, and Carnegie Mellon published a joint paper on LLM evaluation.",
    "Prime Minister Keir Starmer met with Chancellor Olaf Scholz in Berlin to discuss AI regulation.",
]

# Process the whole batch first, then report each article.
article_results = extract_all_names(news_articles)
for result in article_results:
    print(f"Article: {result['text']}...")
    print(f"  People: {result['people']}")
    # The "cities" list holds every GPE entity, hence the combined label.
    print(f"  Cities/Countries: {result['cities']}")
    print()

Frequency Analysis Across Documents

import spacy
from collections import Counter

nlp = spacy.load("en_core_web_sm")

# Small corpus in which some people and places are mentioned more than once.
corpus = """
Sam Altman presented in San Francisco. Dario Amodei spoke in Washington DC.
Jensen Huang keynoted in Las Vegas. Sam Altman and Dario Amodei debated AI safety.
Google DeepMind in London published new research. Microsoft in Redmond and New York announced partnerships.
"""

doc = nlp(corpus)

# Count mentions per entity string: people in one tally, GPE and LOC
# entities together in the other.
person_counter = Counter(
    ent.text for ent in doc.ents if ent.label_ == "PERSON"
)
place_counter = Counter(
    ent.text for ent in doc.ents if ent.label_ in ("GPE", "LOC")
)

# Report the five most frequent entries of each tally.
print("Most mentioned people:")
for name, count in person_counter.most_common(5):
    print(f"  {name}: {count}x")

print("\nMost mentioned places:")
for place, count in place_counter.most_common(5):
    print(f"  {place}: {count}x")