Extracting Names of People, Cities, and Countries Using NLP
Extracting named entities — people, cities, countries, organizations — from text is one of the most common practical NLP tasks. It powers journalism tools, contact enrichment, geography tagging, and compliance systems.
Extracting Names with spaCy NER
spaCy’s NER pipeline identifies people (PERSON), geopolitical entities (GPE), locations (LOC), and organizations (ORG):
import spacy
# Load the "en_core_web_sm" pipeline once at module level; extract_names() reuses this object on every call.
nlp = spacy.load("en_core_web_sm")
def extract_names(text):
    """Group the named entities spaCy finds in *text* by category.

    Returns a dict with the keys ``people`` (PERSON), ``cities_countries``
    (GPE), ``locations`` (LOC) and ``organizations`` (ORG). Each value is a
    list of entity strings in first-seen order with duplicates removed.
    Entities carrying any other label are ignored.
    """
    # Entity label -> output bucket. Insertion order fixes the key order
    # of the returned dict.
    bucket_for_label = {
        "PERSON": "people",
        "GPE": "cities_countries",  # countries, cities, states
        "LOC": "locations",  # mountains, rivers, landmarks
        "ORG": "organizations",
    }
    grouped = {bucket: [] for bucket in bucket_for_label.values()}

    for entity in nlp(text).ents:
        bucket = bucket_for_label.get(entity.label_)
        if bucket is not None:
            grouped[bucket].append(entity.text)

    # dict.fromkeys keeps only the first occurrence of each string, in order.
    return {
        bucket: list(dict.fromkeys(names))
        for bucket, names in grouped.items()
    }
# Sample input for extract_names(). The line breaks inside the literal are
# restored so that words at the wrap points are no longer fused together
# ("modelin", "availablefor", "confirmedGoogle").
text = """Elon Musk's xAI, headquartered in Memphis, Tennessee, launched the Grok-2 model
in August 2024. Meta's CEO Mark Zuckerberg announced Llama 3 would be available
for researchers in Paris, London, and Tokyo. Meanwhile, Sundar Pichai confirmed
Google DeepMind's expanded operations in Zurich and New York."""
result = extract_names(text)
for category, names in result.items():
    print(f"{category}: {names}")

Extracting Country Names with a Reference List
For higher precision when you only need country names:
import pycountry
import spacy

# Load the spaCy pipeline once; extract_countries() reuses it on every call.
nlp = spacy.load("en_core_web_sm")

# Build a set of country names for fast lookup. Everything is stored
# lowercase so callers can match case-insensitively; the ISO alpha-2 and
# alpha-3 codes are included alongside the full names.
COUNTRY_NAMES = {country.name.lower() for country in pycountry.countries}
COUNTRY_NAMES.update({country.alpha_2.lower() for country in pycountry.countries})
COUNTRY_NAMES.update({country.alpha_3.lower() for country in pycountry.countries})

# Add common name variants
COUNTRY_NAMES.update({
    "usa", "us", "uk", "uae", "south korea", "north korea",
    "taiwan", "russia", "iran", "czech republic",
})
def extract_countries(text):
    """Return the country names mentioned in *text*.

    Only spaCy entities labelled GPE are considered, and one is kept only
    when its lowercased text is present in ``COUNTRY_NAMES``.

    Returns a list of unique entity strings in first-seen order, so the
    output is deterministic across runs.
    """
    doc = nlp(text)

    validated = []
    for ent in doc.ents:
        if ent.label_ != "GPE":
            continue
        name = ent.text
        # NOTE(review): the tagger may keep a leading article in the span
        # ("the United Kingdom"), which would never match the reference set.
        # Retry without it only when the raw text did not match.
        if name.lower() not in COUNTRY_NAMES and name.lower().startswith("the "):
            name = name[4:]
        if name.lower() in COUNTRY_NAMES:
            validated.append(name)

    # dict.fromkeys de-duplicates while preserving first-seen order, unlike
    # set(), whose iteration order is arbitrary.
    return list(dict.fromkeys(validated))
# Sample input for extract_countries(). The line breaks inside the literal
# are restored so that words at the wrap points are no longer fused together
# ("Canada,and", "participatedvia").
text = """The AI summit was attended by delegates from France, Germany, Japan, Canada,
and the United Kingdom. Observers from Brazil, India, and South Korea participated
via video link."""
countries = extract_countries(text)
print("Countries found:", countries)

Person Name Extraction with Transformer NER
Transformer-based models achieve higher accuracy, especially for names in challenging contexts:
from transformers import pipeline
# Build the "ner" pipeline once at module level; extract_people_and_places() calls it for each input text.
# NOTE(review): aggregation_strategy="simple" presumably merges sub-word pieces into whole entities, which
# would explain the "entity_group" / "word" keys read from each result — confirm against the transformers docs.
ner_pipeline = pipeline( "ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
def extract_people_and_places(text):
    """Extract people, locations and organizations from *text*.

    Runs the module-level ``ner_pipeline`` and groups each result's
    ``word`` by its ``entity_group``: PER -> ``people``,
    LOC -> ``locations``, ORG -> ``organizations``. Results with any other
    group are ignored.

    Returns a dict of lists. Each list holds unique strings in first-seen
    order, so the output is deterministic across runs.
    """
    key_for_group = {
        "PER": "people",
        "LOC": "locations",
        "ORG": "organizations",
    }
    grouped = {key: [] for key in key_for_group.values()}

    # Single pass over the pipeline output instead of one scan per category.
    for entity in ner_pipeline(text):
        key = key_for_group.get(entity["entity_group"])
        if key is not None:
            grouped[key].append(entity["word"])

    # dict.fromkeys de-duplicates while keeping first-seen order, unlike
    # set(), whose iteration order is arbitrary.
    return {key: list(dict.fromkeys(words)) for key, words in grouped.items()}
# Sample sentences fed one at a time to extract_people_and_places().
texts = [
    "Satya Nadella and Mustafa Suleyman presented at Microsoft Build in Seattle.",
    "The WHO director-general Tedros Adhanom Ghebreyesus spoke at the Geneva conference.",
    "Researchers at MIT Cambridge and Oxford collaborated on the NLP benchmark.",
]
for text in texts:
    result = extract_people_and_places(text)
    print(f"Text: {text[:60]}...")
    for key, values in result.items():
        if values:
            print(f" {key}: {values}")
    print()

Handling Name Disambiguation
The same string can refer to different entities:
import spacy
# Load the "en_core_web_sm" pipeline once; the disambiguation loop below runs it on each sample sentence.
nlp = spacy.load("en_core_web_sm")
# Sentence pairs that reuse the same surface string ("Apple"/"apple",
# "Amazon") in different senses.
texts = [
    "Apple released a new chip architecture at their California headquarters.",
    "She picked up an apple from the farmers market in Vermont.",
    "Amazon is expanding its warehouse network in Poland and Romania.",
    "They kayaked down the Amazon through Brazil for three weeks.",
]
for text in texts:
    doc = nlp(text)
    for ent in doc.ents:
        print(f"[{ent.label_}] '{ent.text}' in: '{text[:55]}'")
    print()

Context from surrounding words helps the neural NER model disambiguate “Apple” (ORG) from “apple” (food).
Building a Complete Name Extraction Pipeline
from collections import Counter

import spacy

# Load the spaCy pipeline once; extract_all_names() reuses it for every document.
nlp = spacy.load("en_core_web_sm")
def extract_all_names(documents):
    """Extract people and place names from each document.

    Args:
        documents: iterable of text strings.

    Returns:
        A list with one dict per document, in input order, with the keys:
        ``text`` (first 80 characters of the document), ``people``,
        ``cities``, ``countries`` and ``places``. The name lists hold unique
        strings in first-seen order.

    Only PERSON entities of at least two whitespace-separated words are kept
    as people, which drops single-token mentions. Every GPE entity goes
    under ``cities``: the GPE label alone does not separate cities from
    countries, so ``countries`` is kept for interface compatibility but is
    always empty.
    """
    results = []

    for doc_text in documents:
        doc = nlp(doc_text)

        entities = {
            "text": doc_text[:80],
            "people": [],
            "cities": [],
            "countries": [],
            "places": [],
        }

        for ent in doc.ents:
            if ent.label_ == "PERSON" and len(ent.text.split()) >= 2:
                entities["people"].append(ent.text)
            elif ent.label_ == "GPE":
                # Cities and countries share the GPE label; no split is attempted.
                entities["cities"].append(ent.text)
            elif ent.label_ == "LOC":
                entities["places"].append(ent.text)

        # De-duplicate while preserving first-seen order; set() would
        # return the names in arbitrary order.
        for key in ("people", "cities", "places"):
            entities[key] = list(dict.fromkeys(entities[key]))

        results.append(entities)

    return results
# Sample articles passed to extract_all_names().
news_articles = [
    "Google's CEO Sundar Pichai announced new AI investments in Singapore and Mumbai during his Asia tour.",
    "Researchers from MIT, Stanford, and Carnegie Mellon published a joint paper on LLM evaluation.",
    "Prime Minister Keir Starmer met with Chancellor Olaf Scholz in Berlin to discuss AI regulation.",
]
for result in extract_all_names(news_articles):
    print(f"Article: {result['text']}...")
    print(f" People: {result['people']}")
    print(f" Cities/Countries: {result['cities']}")
    print()

Frequency Analysis Across Documents
from collections import Counter

import spacy

nlp = spacy.load("en_core_web_sm")

# The line breaks inside the literal are restored so sentences at the wrap
# points are no longer fused together ("DC.Jensen", "safety.Google").
corpus = """Sam Altman presented in San Francisco. Dario Amodei spoke in Washington DC.
Jensen Huang keynoted in Las Vegas. Sam Altman and Dario Amodei debated AI safety.
Google DeepMind in London published new research. Microsoft in Redmond and New York announced partnerships."""

doc = nlp(corpus)

# Count how often each entity string occurs: PERSON entities go in one
# counter, GPE and LOC entities in the other.
person_counter = Counter()
place_counter = Counter()

for ent in doc.ents:
    if ent.label_ == "PERSON":
        person_counter[ent.text] += 1
    elif ent.label_ in ("GPE", "LOC"):
        place_counter[ent.text] += 1

# Report the five most frequent entries of each kind.
print("Most mentioned people:")
for name, count in person_counter.most_common(5):
    print(f" {name}: {count}x")

print("\nMost mentioned places:")
for place, count in place_counter.most_common(5):
    print(f" {place}: {count}x")