Resume Skill Extraction Using NLP
Resume skill extraction automatically identifies technical skills, soft skills, tools, and qualifications from candidate profiles. It’s a core component of applicant tracking systems (ATS), talent analytics, and automated screening tools.
Approach 1: Keyword Matching Against Skills Dictionary
The simplest and fastest approach — match against a curated skills list:
import re

from sklearn.feature_extraction.text import CountVectorizer
# Curated skills dictionary: category name -> lowercase skill keywords.
SKILLS_DATABASE = {
    "programming": [
        "python", "javascript", "java", "typescript", "go",
        "rust", "c++", "ruby", "scala", "r",
    ],
    "ml_frameworks": [
        "pytorch", "tensorflow", "keras", "scikit-learn",
        "xgboost", "lightgbm", "jax",
    ],
    "nlp": [
        "bert", "gpt", "transformers", "spacy", "nltk",
        "hugging face", "langchain", "llamaindex",
    ],
    "data": [
        "pandas", "numpy", "spark", "sql", "postgresql",
        "mongodb", "redis", "airflow", "dbt",
    ],
    "cloud": [
        "aws", "gcp", "azure", "docker", "kubernetes",
        "terraform", "mlflow", "sagemaker",
    ],
    "soft_skills": [
        "leadership", "communication", "teamwork",
        "problem solving", "mentoring", "agile",
    ],
}
def _text_mentions_skill(text_lower, skill):
    """Return True if *skill* occurs in *text_lower* as a whole term."""
    parts = skill.lower().split()
    if not parts:
        return False
    # Join words with \s+ so a multi-word skill still matches across a line
    # break or repeated spaces in the resume.
    body = r"\s+".join(re.escape(part) for part in parts)
    # Lookarounds instead of \b: \b does not work after symbols such as the
    # "+" in "c++", and the lookahead keeps a short skill like "c" from
    # matching inside "c++" or "c#".
    return re.search(rf"(?<!\w){body}(?![\w+#])", text_lower) is not None


def extract_skills_by_keyword(resume_text, skills_database=None):
    """Find known skills mentioned in a resume, grouped by category.

    Matching is case-insensitive and term-based rather than substring-based,
    so "r" does not match every word containing the letter, "go" does not
    match "google", and "java" does not match "javascript".

    Args:
        resume_text: Raw resume text. Empty or None yields an empty result.
        skills_database: Optional mapping of category -> list of skill
            keywords. Defaults to the module-level SKILLS_DATABASE.

    Returns:
        Dict mapping each category with at least one hit to the list of
        matched keywords, in the order they appear in the database.
    """
    if skills_database is None:
        skills_database = SKILLS_DATABASE
    if not resume_text:
        return {}

    text_lower = resume_text.lower()
    found_skills = {}
    for category, skills in skills_database.items():
        matched = [
            skill for skill in skills if _text_mentions_skill(text_lower, skill)
        ]
        if matched:
            found_skills[category] = matched
    return found_skills
resume = """Jane Doe | Senior ML EngineerSan Francisco, CA | jane@example.com
EXPERIENCEML Engineer — TechCorp (2022-2025)- Built NLP pipelines using PyTorch and Hugging Face Transformers (BERT, GPT fine-tuning)- Deployed models on AWS SageMaker and GCP Vertex AI- Led a team of 4 engineers; mentored junior engineers on MLOps best practices- Used dbt and Airflow for ETL orchestration; PostgreSQL and Redis for storage
SKILLSPython, TypeScript, SQL, Docker, Kubernetes, LangChain, spaCy, NLTKscikit-learn, XGBoost, Pandas, Spark, Terraform, MLflow"""
skills = extract_skills_by_keyword(resume)for category, found in skills.items(): print(f"{category}: {found}")Approach 2: NER-Based Extraction with spaCy
Train or use spaCy to identify skills as named entities:
import re

import spacy

# spaCy pipeline used by extract_skills_spacy below.
nlp = spacy.load("en_core_web_sm")
# Lowercase technology names that extract_skills_spacy treats as skills.
TECH_KEYWORDS = {
    "pytorch", "tensorflow", "python", "java", "javascript",
    "bert", "gpt", "transformers", "spacy", "fastapi",
    "docker", "kubernetes", "aws", "gcp", "azure",
    "sql", "mongodb", "redis", "spark", "langchain", "llm",
}
def extract_skills_spacy(resume_text):
    """Collect technology mentions from a resume using the spaCy pipeline.

    Three views of the parsed document are checked against TECH_KEYWORDS
    (case-insensitively) and merged: PRODUCT/ORG entities, individual
    non-stop-word tokens, and noun chunks.

    Returns:
        Sorted list of the distinct matched surface forms as they appear in
        the text.
    """
    doc = nlp(resume_text)

    def is_tech(text):
        return text.lower() in TECH_KEYWORDS

    # 1. Named entities labelled as products or organizations.
    entity_hits = {
        ent.text
        for ent in doc.ents
        if ent.label_ in ("PRODUCT", "ORG") and is_tech(ent.text)
    }
    # 2. Single tokens.
    token_hits = {tok.text for tok in doc if is_tech(tok.text) and not tok.is_stop}
    # 3. Noun chunks (candidate multi-word skills).
    chunk_hits = {chunk.text for chunk in doc.noun_chunks if is_tech(chunk.text)}

    return sorted(entity_hits.union(token_hits, chunk_hits))
resume = """Senior Data Scientist with 6 years of experience.Proficient in Python, PyTorch, and TensorFlow for deep learning model development.Built and deployed LLM-powered applications using LangChain, BERT, and GPT-4 API.Experienced with Docker, Kubernetes, and AWS for production ML systems.Strong background in SQL, Spark, and MongoDB for data engineering tasks."""
skills = extract_skills_spacy(resume)print("Extracted skills:", skills)Approach 3: Transformer-Based Skill Extraction
Fine-tune a NER model on a labeled resume dataset:
from transformers import pipeline
# Use a pre-trained NER model; for production, fine-tune on labeled resumes.
ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
def extract_skills_transformer(resume_text, skill_whitelist=None):
    """Extract candidate skills from a resume with the module-level NER pipeline.

    Args:
        resume_text: Raw resume text. Empty or whitespace-only text returns
            an empty list without invoking the model.
        skill_whitelist: Optional iterable of allowed skill names. Compared
            case-insensitively; when None, every qualifying entity is kept.

    Returns:
        List of dicts with keys "skill" (entity text), "type" (entity group
        label) and "confidence" (score rounded to 3 decimals, as a plain
        float).
    """
    if not resume_text or not resume_text.strip():
        return []

    # Lowercase the whitelist once so mixed-case entries such as "PyTorch"
    # can match the lowercased entity text.
    allowed = None
    if skill_whitelist is not None:
        allowed = {name.lower() for name in skill_whitelist}

    skills = []
    for entity in ner(resume_text):
        word = entity["word"].strip()
        label = entity["entity_group"]

        # MISC and ORG often contain tech tools in resumes. PER is accepted
        # too. NOTE(review): presumably because tool names can be mislabeled
        # as persons; confirm that this is intended.
        if label not in ("MISC", "ORG", "PER") or len(word) <= 1:
            continue
        if allowed is not None and word.lower() not in allowed:
            continue

        skills.append({
            "skill": word,
            "type": label,
            # Cast to float so the value is a plain Python number whatever
            # numeric type the pipeline returns.
            "confidence": round(float(entity["score"]), 3),
        })

    return skills
resume = "Experienced ML engineer skilled in PyTorch, AWS, and Kubernetes. Previously at Google and OpenAI."skills = extract_skills_transformer(resume)for s in skills: print(f"[{s['type']} {s['confidence']}] {s['skill']}")Approach 4: LLM-Powered Structured Extraction
For the richest, most accurate extraction:
import json

from openai import OpenAI

# API client used by extract_resume_skills_llm; constructed with default settings.
client = OpenAI()
def extract_resume_skills_llm(resume_text, model="gpt-4o-mini"):
    """Ask a chat model to extract structured skills from a resume.

    Args:
        resume_text: Raw resume text, embedded verbatim in the prompt.
        model: Chat model name passed to the completions API.

    Returns:
        The parsed JSON object returned by the model.

    Raises:
        ValueError: If the response has no content, or the content is not
            valid JSON (json.JSONDecodeError is a ValueError subclass).
    """
    prompt = f"""Extract all skills and qualifications from this resume and return JSON.
Categories to identify:
- technical_skills: programming languages, frameworks, tools, platforms
- soft_skills: communication, leadership, collaboration, etc.
- certifications: named certifications or degrees
- experience_years: years of experience if mentioned
- seniority: junior/mid/senior/lead/principal if implied

Resume:
{resume_text}

Return only valid JSON with these exact keys."""

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
        temperature=0,
    )

    content = response.choices[0].message.content
    # Fail with a clear message instead of the TypeError json.loads would
    # raise on None.
    if not content:
        raise ValueError("LLM returned no content to parse as JSON")
    return json.loads(content)
resume = """Sarah Chen | Staff Engineer | New York
10+ years building scalable ML systems. Led teams of 8-12 engineers.Expert in Python, PyTorch, Kubernetes, and AWS. Deep experience with NLP:BERT fine-tuning, RAG pipelines with LangChain, vector databases (Pinecone, Weaviate).AWS Certified ML Specialist. Published 3 papers on efficient transformer training.Strong communicator and mentor. Experience with Agile and cross-functional collaboration."""
result = extract_resume_skills_llm(resume)print(json.dumps(result, indent=2))Matching Skills to Job Requirements
from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model shared by match_resume_to_job below.
model = SentenceTransformer('all-MiniLM-L6-v2')
def match_resume_to_job(resume_skills, job_requirements, threshold=0.7):
    """Match each job requirement to its most similar resume skill.

    Both lists are embedded once with the module-level sentence-embedding
    model and compared by cosine similarity.

    Args:
        resume_skills: List of skill strings taken from the resume.
        job_requirements: List of requirement strings from the job posting.
        threshold: A requirement counts as met when its best similarity
            score is strictly greater than this value.

    Returns:
        List of dicts with keys "requirement", "best_match", "score"
        (rounded to 3 decimals) and "met", sorted by score, highest first.
        Empty list if either input is empty.
    """
    if not resume_skills or not job_requirements:
        return []

    resume_embeddings = model.encode(resume_skills)
    job_embeddings = model.encode(job_requirements)

    # One similarity matrix (requirements x skills) instead of re-encoding
    # every requirement inside the loop.
    similarity = util.cos_sim(job_embeddings, resume_embeddings)

    matches = []
    for req, scores in zip(job_requirements, similarity):
        best_match_idx = scores.argmax().item()
        best_score = scores[best_match_idx].item()
        matches.append({
            "requirement": req,
            "best_match": resume_skills[best_match_idx],
            "score": round(best_score, 3),
            "met": best_score > threshold,
        })

    return sorted(matches, key=lambda m: m["score"], reverse=True)
resume_skills = ["Python", "PyTorch", "BERT fine-tuning", "AWS SageMaker", "Docker", "SQL", "Leadership"]job_requirements = [ "Deep learning framework experience", "Cloud platform deployment", "Natural language processing", "Container orchestration", "Team leadership experience"]
matches = match_resume_to_job(resume_skills, job_requirements)print(f"Match rate: {sum(m['met'] for m in matches)}/{len(matches)}")print()for m in matches: status = "✓" if m["met"] else "✗" print(f"{status} [{m['score']:.3f}] {m['requirement']}") print(f" Best match: {m['best_match']}")Building a Complete Resume Parser
import re
from collections import defaultdict

import spacy

# spaCy pipeline used by parse_resume below.
nlp = spacy.load("en_core_web_sm")
# Category name -> regex with one capturing group listing that category's
# skills. Patterns are written for lowercased text.
SKILL_PATTERNS = {
    # Ends with (?!\w) rather than \b: there is no word boundary between "+"
    # and a following space or punctuation mark, so a trailing \b keeps
    # "c++" from ever matching in normal text.
    "languages": r'\b(python|javascript|typescript|java|go|rust|scala|r|c\+\+|ruby)(?!\w)',
    "frameworks": r'\b(pytorch|tensorflow|react|fastapi|django|flask|nextjs|vue|angular)\b',
    "nlp_tools": r'\b(bert|gpt|transformers|spacy|langchain|llamaindex|hugging face|nltk)\b',
    "infrastructure": r'\b(docker|kubernetes|aws|gcp|azure|terraform|ansible)\b',
    "databases": r'\b(sql|postgresql|mysql|mongodb|redis|elasticsearch|pinecone)\b',
}
def parse_resume(text):
    """Parse a resume into skills, organizations, people, dates and experience.

    Skills come from the SKILL_PATTERNS regexes applied to the lowercased
    text; organizations, people and dates come from the spaCy entities
    labelled ORG, PERSON and DATE.

    Returns:
        Dict with keys:
            "skills": category -> distinct matched skills (every category
                is present, possibly with an empty list),
            "organizations": distinct ORG entity texts,
            "experience_years": int from the first "<N> years [of]
                experience" phrase, or None if there is none,
            "mentioned_people": PERSON entity texts,
            "dates": DATE entity texts.
    """
    text_lower = text.lower()
    doc = nlp(text)

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # output is stable across runs (set iteration order for strings is not).
    skills = {
        category: list(dict.fromkeys(re.findall(pattern, text_lower)))
        for category, pattern in SKILL_PATTERNS.items()
    }

    people = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
    orgs = [ent.text for ent in doc.ents if ent.label_ == "ORG"]
    dates = [ent.text for ent in doc.ents if ent.label_ == "DATE"]

    years_match = re.search(r'(\d+)\+?\s+years?\s+(?:of\s+)?experience', text_lower)
    years = int(years_match.group(1)) if years_match else None

    return {
        "skills": skills,
        "organizations": list(dict.fromkeys(orgs)),
        "experience_years": years,
        "mentioned_people": people,
        "dates": dates,
    }