Technology  /  NLP

💬 Natural Language Processing 40 guides · updated 2026

From tokenisation and embeddings to transformer-based language understanding — the NLP fundamentals that underpin every modern LLM.

Extracting Email Addresses Using NLP

Email extraction pulls valid email addresses from unstructured text — an essential step for contact management, data enrichment, lead generation, and compliance workflows.


Basic Regex Extraction

Regular expressions are the most reliable approach for standard email formats:

import re
def extract_emails_basic(text):
    """Return every email-like substring found in ``text``.

    Args:
        text: Arbitrary unstructured text to scan.

    Returns:
        A list of the matched address strings in order of appearance.
        Duplicates are kept.
    """
    # Inside a character class "|" is a literal, not alternation, so the TLD
    # class must be [A-Za-z]; writing [A-Z|a-z] would let "co|m" pass as a TLD.
    pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    return re.findall(pattern, text)
# Demo input: four well-formed addresses, then fragments that must be ignored.
text = """
Contact our team at support@example.com for general inquiries.
For sales, reach alice.jones@company.co.uk or bob_smith@tech.org.
Technical support: dev+nlp@api.service.io
Invalid examples: not-an-email @missinglocal .com
"""
# Expected matches, in order of appearance:
# ['support@example.com', 'alice.jones@company.co.uk', 'bob_smith@tech.org', 'dev+nlp@api.service.io']
emails = extract_emails_basic(text)
print(f"Found emails: {emails}")

Robust Email Pattern with Validation

import re
# Verbose-mode pattern: whitespace and "# ..." comments outside character
# classes are ignored by the regex engine. Every group is non-capturing, so a
# match object's group(0) is the whole address.
EMAIL_REGEX = re.compile(
    r"""
(?:[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+ # local part: standard chars
(?:\.[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+)*) # local part: dot segments
@ # at sign
(?:[a-zA-Z0-9] # domain: first char
(?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])? # domain: middle
\.)+ # domain: dots
[a-zA-Z]{2,} # TLD: 2+ chars
""",
    flags=re.VERBOSE,
)


def extract_emails_robust(text):
    """Return all substrings of ``text`` that match ``EMAIL_REGEX``.

    Args:
        text: Unstructured text to scan.

    Returns:
        A list of matched address strings in order of appearance.
    """
    return [hit.group(0) for hit in EMAIL_REGEX.finditer(text)]
# Demo input: three ordinary addresses, an IP-literal address and a malformed
# "@@" string.
text = """
Technical contact: engineer@startup.io
Support: help@service.co.uk
Admin: admin@subdomain.company.com
Old-style: user@[127.0.0.1]
Bogus: @@notvalid.com
"""
emails = extract_emails_robust(text)
print(f"Extracted: {emails}")

Handling Obfuscated Emails

People often obfuscate emails to avoid scraping:

import re
def deobfuscate_email(text):
    """Normalize common obfuscation patterns.

    Rewrites ``[at]``, ``(at)`` and a whitespace-delimited ``at`` to ``@``,
    and ``[dot]``, ``(dot)`` and a whitespace-delimited ``dot`` to ``.``.
    Matching is case-insensitive.

    Whitespace around the bracketed and parenthesised tokens is removed with
    the token, so ``"info [at] company [dot] com"`` becomes
    ``"info@company.com"`` rather than ``"info @ company . com"``.

    The bare-word forms are replaced wherever they occur between whitespace,
    so ordinary prose such as ``"look at this"`` is rewritten too. Treat the
    result as input for an email pattern, not as display text.

    Args:
        text: Text that may contain obfuscated addresses.

    Returns:
        The text with the substitutions applied.
    """
    replacements = [
        (r'\s*\[at\]\s*', '@'),
        (r'\s*\(at\)\s*', '@'),
        (r'\s+at\s+', '@'),
        (r'\s*\[dot\]\s*', '.'),
        (r'\s*\(dot\)\s*', '.'),
        (r'\s+dot\s+', '.'),
    ]
    for pattern, replacement in replacements:
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
    return text
# One sample per obfuscation style, plus a plain address as a control.
obfuscated_examples = [
    "Contact us at info [at] company [dot] com",
    "Reach admin(at)example(dot)org for support",
    "Email: sales at domain dot io",
    "Reach out to hello@normal.com normally",
]
EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
for example in obfuscated_examples:
    # Normalise first, then run the ordinary pattern over the cleaned text.
    emails = EMAIL_PATTERN.findall(deobfuscate_email(example))
    print(f"Original: {example}\nExtracted: {emails}\n")

Bulk Processing Multiple Documents

import re
from pathlib import Path
EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')


def extract_emails_from_text(text):
    """Return the distinct email addresses found in ``text``.

    Args:
        text: Unstructured text to scan.

    Returns:
        A list with each matched address once, in order of first appearance.
        Addresses that differ only in letter case count as distinct.
    """
    # dict.fromkeys de-duplicates while keeping insertion order; going through
    # a set would return the addresses in an arbitrary order.
    return list(dict.fromkeys(EMAIL_PATTERN.findall(text)))
# Simulate processing multiple documents: file name -> file content.
documents = {
    "contract.txt": "Parties: Alice Smith (alice@lawfirm.com) and Bob Jones (bjones@corp.io). Questions to legal@example.com.",
    "readme.md": "Report bugs to bugs@github.com or contact maintainers at dev@opensource.org.",
    "newsletter.html": "Unsubscribe: unsubscribe@newsletter.com | Help: help@newsletter.com",
    "support_ticket.txt": "User reported by: jane.doe@customer.co.uk. Assigned to: support-team@internal.company.com"
}
# Per-document results, keyed by file name in the same order as `documents`.
all_results = {
    name: extract_emails_from_text(body) for name, body in documents.items()
}
for filename, emails in all_results.items():
    print(f"{filename}: {emails}")
# Aggregate unique emails across every document.
all_emails = set().union(*all_results.values())
print(f"\nTotal unique emails found: {len(all_emails)}")
print("All emails:", sorted(all_emails))

Validating Extracted Emails

import re
import socket
# Compiled once at import. The domain is written as one or more non-empty,
# dot-terminated labels followed by an alphabetic TLD, so "a@b..com" and
# "a@.com" are rejected.
_EMAIL_FORMAT_RE = re.compile(
    r'[a-zA-Z0-9._%+-]+@(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}'
)


def is_valid_email_format(email):
    """Return True when ``email`` as a whole has a plausible address shape.

    The check is purely syntactic: a non-empty local part, one ``@``, a
    domain of non-empty dot-separated labels and an alphabetic TLD of two or
    more letters.

    Args:
        email: Candidate address string.

    Returns:
        ``True`` if the entire string matches, otherwise ``False``.
    """
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, which would accept "user@example.com\n".
    return _EMAIL_FORMAT_RE.fullmatch(email) is not None
def has_valid_domain_syntax(email, max_tld_length=63):
    """Check domain has at least one dot and valid TLD length.

    The address must contain exactly one ``@``. The part after it must split
    on ``.`` into at least two labels, none of them empty, and the last label
    (the TLD) must be between 2 and ``max_tld_length`` characters long.

    Args:
        email: Candidate address string.
        max_tld_length: Longest TLD accepted. Defaults to 63, the DNS limit
            for a single label, so long TLDs such as ``.photography`` pass.

    Returns:
        ``True`` if the domain passes these checks, otherwise ``False``.
    """
    if email.count('@') != 1:
        return False
    labels = email.split('@')[1].split('.')
    # all(labels) rejects empty labels such as the leading one in "a@.com".
    if len(labels) < 2 or not all(labels):
        return False
    return 2 <= len(labels[-1]) <= max_tld_length
# Optional: DNS lookup (network required)
def domain_exists(email):
    """Return True when the address's domain resolves via ``getaddrinfo``.

    The domain is the text between the first and second ``@`` (or the end of
    the string). This resolves the host name to addresses; it does not look
    up mail (MX) records.

    Args:
        email: Candidate address string.

    Returns:
        ``True`` if ``socket.getaddrinfo`` succeeds for the domain. ``False``
        if there is no ``@``, the domain is empty, resolution fails, or the
        domain cannot be IDNA-encoded.
    """
    parts = email.split('@')
    if len(parts) < 2 or not parts[1]:
        return False
    try:
        socket.getaddrinfo(parts[1], None)
    except (socket.gaierror, UnicodeError):
        # UnicodeError: host names are IDNA-encoded before the lookup, and
        # that step rejects empty or over-long labels (e.g. ".example.com").
        return False
    return True
# Two well-formed addresses followed by four malformed ones.
test_emails = [
    "valid@example.com",
    "user.name+tag@subdomain.company.co.uk",
    "invalid@",
    "@nodomain.com",
    "no-at-sign.com",
    "double@@at.com"
]
for email in test_emails:
    fmt = is_valid_email_format(email)
    # The domain check only runs once the overall format has passed.
    dom = fmt and has_valid_domain_syntax(email)
    mark = '✓' if dom else '✗'
    print(f"{mark} {email:<40} format: {fmt}, domain: {dom}")

Using spaCy for Contextual Email Extraction

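For emails embedded in running text, it helps to capture the text surrounding each match so that false positives can be reviewed and filtered. The example below loads a spaCy pipeline, but the context window itself is taken by slicing the raw string around each regex match: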

import spacy
import re
# Load the spaCy pipeline named "en_core_web_sm" once, at module import.
nlp = spacy.load("en_core_web_sm")
EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')


def extract_emails_with_context(text, window=40):
    """Return each email found in ``text`` with a snippet of surrounding text.

    The snippet is taken by slicing ``text`` around the regex match, so no
    NLP pipeline run is needed to produce it.

    Args:
        text: Unstructured text to scan.
        window: Number of characters to keep on each side of the match.
            Defaults to 40.

    Returns:
        A list of dicts in order of appearance. Each has ``"email"`` (the
        matched address) and ``"context"`` (the match plus up to ``window``
        characters either side, clipped to the text bounds and stripped of
        leading and trailing whitespace).
    """
    emails_with_context = []
    text_len = len(text)
    for match in EMAIL_PATTERN.finditer(text):
        # Clamp the window so it never reaches outside the text.
        start = max(0, match.start() - window)
        end = min(text_len, match.end() + window)
        emails_with_context.append({
            "email": match.group(),
            "context": text[start:end].strip(),
        })
    return emails_with_context
# Demo input: three addresses, each with different surrounding text.
text = """
For customer inquiries: support@company.com (Monday-Friday 9-5 EST).
Engineering lead: jane.chen@company.com
Partnerships: biz@company.io - response within 24 hours guaranteed.
"""
results = extract_emails_with_context(text)
for entry in results:
    # One record per match: the address, then its context between ellipses.
    print(f"Email: {entry['email']}\nContext: ...{entry['context']}...\n")

Complete Email Extraction Pipeline

import re
from collections import defaultdict
EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')


def pipeline_extract_emails(text):
    """Full extraction pipeline: deobfuscate, extract, deduplicate, validate.

    Args:
        text: Unstructured text that may contain plain or obfuscated
            addresses (``[at]``, ``(at)``, `` at ``, ``[dot]``, ``(dot)``,
            `` dot ``, matched case-insensitively).

    Returns:
        A dict with three keys:
            ``"emails"``: sorted list of distinct, lower-cased addresses.
            ``"by_domain"``: dict mapping each domain to its addresses, each
                list in sorted order.
            ``"count"``: number of addresses in ``"emails"``.
    """
    # Step 1: Deobfuscate. Whitespace around the bracketed and parenthesised
    # tokens is removed with the token; otherwise "sales [at] company [dot] com"
    # would become "sales @ company . com" and never match EMAIL_PATTERN.
    text = re.sub(r'\s*(?:\[at\]|\(at\))\s*|\s+at\s+', '@', text, flags=re.IGNORECASE)
    text = re.sub(r'\s*(?:\[dot\]|\(dot\))\s*|\s+dot\s+', '.', text, flags=re.IGNORECASE)
    # Step 2: Extract
    raw_emails = EMAIL_PATTERN.findall(text)
    # Step 3: Normalize and deduplicate. Sorting here makes both the flat
    # list and the per-domain lists come out in a deterministic order.
    normalized = sorted({e.lower() for e in raw_emails})
    # Step 4: Basic validation (non-empty local part, dotted domain) and
    # Step 5: Categorize by domain
    valid = []
    by_domain = defaultdict(list)
    for email in normalized:
        local, _, domain = email.partition('@')
        if local and '.' in domain:
            valid.append(email)
            by_domain[domain].append(email)
    return {"emails": valid, "by_domain": dict(by_domain), "count": len(valid)}
# Demo input mixing plain addresses with bracketed and bare-word obfuscation.
text = """
Please reach out to:
- Sales team: sales [at] company [dot] com
- HR Department: hr@company.com or recruiting@company.com
- CEO: ceo@company.io (for press inquiries only)
- Support: help at support dot io
"""
result = pipeline_extract_emails(text)
found = result['emails']
grouped = result['by_domain']
# Flat listing first, then the same addresses grouped by domain.
print(f"Found {result['count']} email(s):")
for email in found:
    print(f" {email}")
print("\nBy domain:")
for domain, emails in grouped.items():
    print(f" {domain}: {emails}")