Technology  /  NLP

💬 Natural Language Processing 40 guides · updated 2026

From tokenisation and embeddings to transformer-based language understanding — the NLP fundamentals that underpin every modern LLM.

Resume Skill Extraction Using NLP

Resume skill extraction automatically identifies technical skills, soft skills, tools, and qualifications from candidate profiles. It’s a core component of applicant tracking systems (ATS), talent analytics, and automated screening tools.


Approach 1: Keyword Matching Against Skills Dictionary

The simplest and fastest approach — match against a curated skills list:

import re
from sklearn.feature_extraction.text import CountVectorizer
# Curated skills dictionary: category name -> lowercase skill terms.
SKILLS_DATABASE = {
    "programming": ["python", "javascript", "java", "typescript", "go", "rust", "c++", "ruby", "scala", "r"],
    "ml_frameworks": ["pytorch", "tensorflow", "keras", "scikit-learn", "xgboost", "lightgbm", "jax"],
    "nlp": ["bert", "gpt", "transformers", "spacy", "nltk", "hugging face", "langchain", "llamaindex"],
    "data": ["pandas", "numpy", "spark", "sql", "postgresql", "mongodb", "redis", "airflow", "dbt"],
    "cloud": ["aws", "gcp", "azure", "docker", "kubernetes", "terraform", "mlflow", "sagemaker"],
    "soft_skills": ["leadership", "communication", "teamwork", "problem solving", "mentoring", "agile"],
}


def extract_skills_by_keyword(resume_text):
    """Find skills from ``SKILLS_DATABASE`` that occur in a resume.

    Matching is case-insensitive and whole-term: a skill only counts when it
    is not embedded in a longer word, so "r" does not match "engineer",
    "go" does not match "google" and "java" does not match "javascript".

    Args:
        resume_text: Raw resume text. ``None`` or an empty string yields an
            empty result.

    Returns:
        A dict mapping each category that had at least one hit to the list
        of matched skills, in the order they appear in ``SKILLS_DATABASE``.
    """
    if not resume_text:
        return {}

    text_lower = resume_text.lower()
    found_skills = {}
    for category, skills in SKILLS_DATABASE.items():
        # Explicit lookarounds rather than \b: terms such as "c++" end in a
        # non-word character, where \b would fail before a space or comma.
        matched = [
            skill
            for skill in skills
            if re.search(rf"(?<!\w){re.escape(skill)}(?!\w)", text_lower)
        ]
        if matched:
            found_skills[category] = matched
    return found_skills
# Demo: run the keyword extractor over a sample resume and print one line
# per category that produced at least one match.
resume = """
Jane Doe | Senior ML Engineer
San Francisco, CA | jane@example.com
EXPERIENCE
ML Engineer — TechCorp (2022-2025)
- Built NLP pipelines using PyTorch and Hugging Face Transformers (BERT, GPT fine-tuning)
- Deployed models on AWS SageMaker and GCP Vertex AI
- Led a team of 4 engineers; mentored junior engineers on MLOps best practices
- Used dbt and Airflow for ETL orchestration; PostgreSQL and Redis for storage
SKILLS
Python, TypeScript, SQL, Docker, Kubernetes, LangChain, spaCy, NLTK
scikit-learn, XGBoost, Pandas, Spark, Terraform, MLflow
"""

skills = extract_skills_by_keyword(resume)
for group, names in skills.items():
    print(f"{group}: {names}")

Approach 2: NER-Based Extraction with spaCy

Use spaCy's pretrained pipeline (or a custom-trained NER component) to identify skills as named entities:

import spacy
import re
# spaCy pipeline used by extract_skills_spacy for entities, tokens and
# noun chunks.
nlp = spacy.load("en_core_web_sm")

# Lowercase technology names recognised by extract_skills_spacy.
TECH_KEYWORDS = {
    "pytorch", "tensorflow", "python", "java", "javascript", "bert", "gpt",
    "transformers", "spacy", "fastapi", "docker", "kubernetes", "aws", "gcp",
    "azure", "sql", "mongodb", "redis", "spark", "langchain", "llm"
}


def extract_skills_spacy(resume_text):
    """Extract known technology names from a resume using spaCy.

    The parsed document is scanned in three passes, and a candidate is kept
    when its lowercased text is in ``TECH_KEYWORDS``:

    1. named entities labelled ``PRODUCT`` or ``ORG``;
    2. individual tokens that are not stop words;
    3. noun chunks (whole-chunk text must equal a keyword).

    Args:
        resume_text: Raw resume text.

    Returns:
        A sorted list of matched skills as they were written in the resume.
        Spellings that differ only by case ("Python" / "python") are
        reported once, using the first form encountered.
    """
    doc = nlp(resume_text)

    # lowercase key -> first surface form seen; this de-duplicates case
    # variants that a plain set of surface forms would keep apart.
    surface_by_key = {}

    def remember(text):
        surface_by_key.setdefault(text.lower(), text)

    # 1. NER-based extraction (find PRODUCT/ORG entities that are tech tools)
    for ent in doc.ents:
        if ent.label_ in ("PRODUCT", "ORG") and ent.text.lower() in TECH_KEYWORDS:
            remember(ent.text)

    # 2. Token-based extraction
    for token in doc:
        if not token.is_stop and token.text.lower() in TECH_KEYWORDS:
            remember(token.text)

    # 3. Noun chunk based (would pick up multi-word keywords if any are added)
    for chunk in doc.noun_chunks:
        if chunk.text.lower() in TECH_KEYWORDS:
            remember(chunk.text)

    return sorted(surface_by_key.values())
# Demo: extract skills from a short free-text profile with the spaCy-based
# extractor and print the resulting list.
resume = """
Senior Data Scientist with 6 years of experience.
Proficient in Python, PyTorch, and TensorFlow for deep learning model development.
Built and deployed LLM-powered applications using LangChain, BERT, and GPT-4 API.
Experienced with Docker, Kubernetes, and AWS for production ML systems.
Strong background in SQL, Spark, and MongoDB for data engineering tasks.
"""

skills = extract_skills_spacy(resume)
print(
    "Extracted skills:",
    skills,
)

Approach 3: Transformer-Based Skill Extraction

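Use a transformer-based NER model, ideally fine-tuned on a labeled resume dataset. The example below uses a general-purpose pre-trained NER model as a stand-in: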

from transformers import pipeline
# Use a pre-trained NER model; for production, fine-tune on labeled resumes
ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")


def extract_skills_transformer(resume_text, skill_whitelist=None):
    """Extract candidate skills from a resume with a transformer NER model.

    Entities labelled ``MISC``, ``ORG`` or ``PER`` whose text is longer than
    one character are kept, optionally restricted to a whitelist.

    Args:
        resume_text: Raw resume text passed to the ``ner`` pipeline.
        skill_whitelist: Optional iterable of allowed skill names. The
            comparison is case-insensitive on both sides. ``None`` disables
            filtering.

    Returns:
        A list of dicts with keys ``"skill"`` (entity text), ``"type"``
        (entity group) and ``"confidence"`` (a plain ``float`` rounded to
        three decimals), in the order the pipeline returned them.
    """
    # Normalise once so whitelist entries with capitals still match the
    # lowercased entity text.
    allowed = None
    if skill_whitelist is not None:
        allowed = {name.lower() for name in skill_whitelist}

    skills = []
    for entity in ner(resume_text):
        word = entity["word"].strip()
        label = entity["entity_group"]
        # MISC and ORG often contain tech tools in resumes.
        # NOTE(review): PER is accepted too, which can admit person names;
        # confirm this is intended.
        if label not in ("MISC", "ORG", "PER") or len(word) <= 1:
            continue
        if allowed is not None and word.lower() not in allowed:
            continue
        skills.append({
            "skill": word,
            "type": label,
            # The pipeline typically reports scores as NumPy scalars; convert
            # so the value is a built-in float (e.g. for json.dumps).
            "confidence": round(float(entity["score"]), 3),
        })
    return skills
# Demo: print every entity the transformer extractor keeps, with its label
# and confidence.
resume = "Experienced ML engineer skilled in PyTorch, AWS, and Kubernetes. Previously at Google and OpenAI."
skills = extract_skills_transformer(resume)
for hit in skills:
    print(f"[{hit['type']} {hit['confidence']}] {hit['skill']}")

Approach 4: LLM-Powered Structured Extraction

For the richest, most accurate extraction:

from openai import OpenAI
import json
# OpenAI API client used by extract_resume_skills_llm.
client = OpenAI()


def extract_resume_skills_llm(resume_text):
    """Ask a chat model to extract structured skill data from a resume.

    The resume is embedded in a prompt requesting JSON with the keys
    ``technical_skills``, ``soft_skills``, ``certifications``,
    ``experience_years`` and ``seniority``. The request uses JSON response
    mode and temperature 0.

    Args:
        resume_text: Raw resume text inserted into the prompt.

    Returns:
        The model's reply parsed with ``json.loads``.
    """
    prompt = f"""Extract all skills and qualifications from this resume and return JSON.
Categories to identify:
- technical_skills: programming languages, frameworks, tools, platforms
- soft_skills: communication, leadership, collaboration, etc.
- certifications: named certifications or degrees
- experience_years: years of experience if mentioned
- seniority: junior/mid/senior/lead/principal if implied
Resume:
{resume_text}
Return only valid JSON with these exact keys.
"""
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        response_format={"type": "json_object"},
        temperature=0,
    )
    raw_reply = completion.choices[0].message.content
    return json.loads(raw_reply)
# Demo: run the LLM extractor on a sample resume and pretty-print the JSON.
resume = """
Sarah Chen | Staff Engineer | New York
10+ years building scalable ML systems. Led teams of 8-12 engineers.
Expert in Python, PyTorch, Kubernetes, and AWS. Deep experience with NLP:
BERT fine-tuning, RAG pipelines with LangChain, vector databases (Pinecone, Weaviate).
AWS Certified ML Specialist. Published 3 papers on efficient transformer training.
Strong communicator and mentor. Experience with Agile and cross-functional collaboration.
"""

result = extract_resume_skills_llm(resume)
print(
    json.dumps(result, indent=2)
)

Matching Skills to Job Requirements

from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model used by match_resume_to_job.
model = SentenceTransformer('all-MiniLM-L6-v2')


def match_resume_to_job(resume_skills, job_requirements, threshold=0.7):
    """Match each job requirement to its most similar resume skill.

    Both lists are embedded once in batch, and a single cosine-similarity
    matrix (requirements x skills) is used to pick the best skill for each
    requirement.

    Args:
        resume_skills: List of skill strings taken from the resume.
        job_requirements: List of requirement strings from the job posting.
        threshold: Similarity a requirement's best match must exceed to
            count as met. Defaults to 0.7.

    Returns:
        A list with one dict per requirement, sorted by descending score,
        with keys ``"requirement"``, ``"best_match"``, ``"score"`` (rounded
        to three decimals) and ``"met"``. Returns ``[]`` if either input is
        empty.
    """
    if not resume_skills or not job_requirements:
        return []

    resume_embeddings = model.encode(resume_skills)
    job_embeddings = model.encode(job_requirements)
    # One row of scores per requirement; avoids re-encoding each
    # requirement individually inside the loop.
    similarity = util.cos_sim(job_embeddings, resume_embeddings)

    matches = []
    for req, scores in zip(job_requirements, similarity):
        best_match_idx = scores.argmax().item()
        best_score = scores[best_match_idx].item()
        matches.append({
            "requirement": req,
            "best_match": resume_skills[best_match_idx],
            "score": round(best_score, 3),
            # Compared on the unrounded similarity.
            "met": best_score > threshold,
        })
    return sorted(matches, key=lambda match: match["score"], reverse=True)
# Demo: score a sample skill list against sample job requirements and print
# a summary line followed by one entry per requirement.
resume_skills = [
    "Python",
    "PyTorch",
    "BERT fine-tuning",
    "AWS SageMaker",
    "Docker",
    "SQL",
    "Leadership",
]
job_requirements = [
    "Deep learning framework experience",
    "Cloud platform deployment",
    "Natural language processing",
    "Container orchestration",
    "Team leadership experience",
]

matches = match_resume_to_job(resume_skills, job_requirements)
met_count = sum(m['met'] for m in matches)
print(f"Match rate: {met_count}/{len(matches)}")
print()
for m in matches:
    if m["met"]:
        status = "✓"
    else:
        status = "✗"
    print(f"{status} [{m['score']:.3f}] {m['requirement']}")
    print(f" Best match: {m['best_match']}")

Building a Complete Resume Parser

import re
import spacy
from collections import defaultdict
# spaCy pipeline that parse_resume calls to obtain named entities.
nlp = spacy.load("en_core_web_sm")
# Regexes keyed by skill category. Each pattern has exactly one capturing
# group, so re.findall returns the matched skill strings. The alternatives
# are lowercase; parse_resume applies them to lowercased text.
SKILL_PATTERNS = {
    # Lookarounds instead of \b: "c++" ends in a non-word character, so a
    # trailing \b only succeeds when a word character follows (e.g. "c++11")
    # and "c++ " or "c++," would never match.
    "languages": r'(?<!\w)(python|javascript|typescript|java|go|rust|scala|r|c\+\+|ruby)(?!\w)',
    "frameworks": r'\b(pytorch|tensorflow|react|fastapi|django|flask|nextjs|vue|angular)\b',
    "nlp_tools": r'\b(bert|gpt|transformers|spacy|langchain|llamaindex|hugging face|nltk)\b',
    "infrastructure": r'\b(docker|kubernetes|aws|gcp|azure|terraform|ansible)\b',
    "databases": r'\b(sql|postgresql|mysql|mongodb|redis|elasticsearch|pinecone)\b',
}
def parse_resume(text):
    """Parse a resume into skills, entities and years of experience.

    Skills come from running every regex in ``SKILL_PATTERNS`` over the
    lowercased text. Organizations, people and dates are the spaCy entities
    labelled ``ORG``, ``PERSON`` and ``DATE``. Years of experience is the
    first "<N> years [of] experience" phrase found, if any.

    Args:
        text: Raw resume text.

    Returns:
        A dict with keys:
        ``"skills"`` - category -> alphabetically sorted unique matches
        (every category is present, possibly with an empty list);
        ``"organizations"`` - unique ORG entities in first-seen order;
        ``"experience_years"`` - int, or ``None`` when no phrase matched;
        ``"mentioned_people"`` - PERSON entity texts, duplicates kept;
        ``"dates"`` - DATE entity texts, duplicates kept.
    """
    text_lower = text.lower()
    doc = nlp(text)

    # sorted() keeps the output stable between runs; list(set(...)) would
    # return the matches in arbitrary order.
    skills = {
        category: sorted(set(re.findall(pattern, text_lower)))
        for category, pattern in SKILL_PATTERNS.items()
    }

    people = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
    dates = [ent.text for ent in doc.ents if ent.label_ == "DATE"]
    # dict.fromkeys de-duplicates while preserving document order.
    organizations = list(
        dict.fromkeys(ent.text for ent in doc.ents if ent.label_ == "ORG")
    )

    years_match = re.search(r'(\d+)\+?\s+years?\s+(?:of\s+)?experience', text_lower)
    years = int(years_match.group(1)) if years_match else None

    return {
        "skills": skills,
        "organizations": organizations,
        "experience_years": years,
        "mentioned_people": people,
        "dates": dates,
    }