Resume Skill Extraction Using NLP

Resume skill extraction automatically identifies technical skills, soft skills, tools, and qualifications from candidate profiles. It’s a core component of applicant tracking systems (ATS), talent analytics, and automated screening tools.

Approach 1: Keyword Matching Against Skills Dictionary

The simplest and fastest approach — match against a curated skills list:

import re
from sklearn.feature_extraction.text import CountVectorizer

# Curated skills dictionary: category name -> lowercase skill terms to look for.
SKILLS_DATABASE = {
    "programming": ["python", "javascript", "java", "typescript", "go", "rust", "c++", "ruby", "scala", "r"],
    "ml_frameworks": ["pytorch", "tensorflow", "keras", "scikit-learn", "xgboost", "lightgbm", "jax"],
    "nlp": ["bert", "gpt", "transformers", "spacy", "nltk", "hugging face", "langchain", "llamaindex"],
    "data": ["pandas", "numpy", "spark", "sql", "postgresql", "mongodb", "redis", "airflow", "dbt"],
    "cloud": ["aws", "gcp", "azure", "docker", "kubernetes", "terraform", "mlflow", "sagemaker"],
    "soft_skills": ["leadership", "communication", "teamwork", "problem solving", "mentoring", "agile"],
}


def _skill_pattern(skill):
    """Return regex source that matches ``skill`` as a whole term."""
    # Any run of whitespace (including a line break) may separate the words of a skill.
    body = r"\s+".join(re.escape(part) for part in skill.split())
    # Lookarounds rather than \b: \b cannot match after a non-word char such as the '+' in "c++".
    return rf"(?<!\w){body}(?!\w)"


def extract_skills_by_keyword(resume_text):
    """Find skills from SKILLS_DATABASE that appear in ``resume_text``.

    Matching is case-insensitive and whole-term: a skill only counts when it is
    not embedded in a longer word, so "go" is not found in "Google", "java" is
    not found in "JavaScript", and "r" is not found in every word containing
    the letter r. Single-letter skills can still match a standalone letter
    (for example the "R" in "R&D").

    Args:
        resume_text: Raw resume text. Empty or None yields an empty result.

    Returns:
        Dict mapping category name to the list of matched skills, in the order
        they are listed in SKILLS_DATABASE. Categories with no match are omitted.
    """
    if not resume_text:
        return {}

    text_lower = resume_text.lower()
    found_skills = {}

    for category, skills in SKILLS_DATABASE.items():
        matched = [
            skill for skill in skills
            if re.search(_skill_pattern(skill), text_lower)
        ]
        if matched:
            found_skills[category] = matched

    return found_skills

# Sample resume used to exercise the keyword matcher.
resume = """
Jane Doe | Senior ML Engineer
San Francisco, CA | jane@example.com

EXPERIENCE
ML Engineer — TechCorp (2022-2025)
- Built NLP pipelines using PyTorch and Hugging Face Transformers (BERT, GPT fine-tuning)
- Deployed models on AWS SageMaker and GCP Vertex AI
- Led a team of 4 engineers; mentored junior engineers on MLOps best practices
- Used dbt and Airflow for ETL orchestration; PostgreSQL and Redis for storage

SKILLS
Python, TypeScript, SQL, Docker, Kubernetes, LangChain, spaCy, NLTK
scikit-learn, XGBoost, Pandas, Spark, Terraform, MLflow
"""

# Print one "category: [skills]" line for every category that had a match.
skills = extract_skills_by_keyword(resume)
for category in skills:
    found = skills[category]
    print(f"{category}: {found}")

Approach 2: NER-Based Extraction with spaCy

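Use spaCy's pretrained pipeline (or a model you train yourself) to find candidate entities, tokens, and noun chunks, then keep the ones that match a list of known technologies: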

import spacy
import re

# Small pretrained English pipeline, loaded once and reused by every call.
nlp = spacy.load("en_core_web_sm")

# Lowercase technology names that count as skills.
TECH_KEYWORDS = {
    "pytorch", "tensorflow", "python", "java", "javascript", "bert", "gpt",
    "transformers", "spacy", "fastapi", "docker", "kubernetes", "aws", "gcp",
    "azure", "sql", "mongodb", "redis", "spark", "langchain", "llm"
}


def extract_skills_spacy(resume_text):
    """Extract known technology names from ``resume_text`` using spaCy.

    Three passes over the parsed document feed one result set: PRODUCT/ORG
    named entities, individual non-stop-word tokens, and noun chunks. A
    candidate is kept only when its lowercased text is in TECH_KEYWORDS.

    Results are de-duplicated case-insensitively, so "Python" and "python" in
    the same resume produce a single entry (the first spelling encountered).

    Args:
        resume_text: Raw resume text.

    Returns:
        Sorted list of matched skills, spelled as they appear in the text.
    """
    doc = nlp(resume_text)

    # lowercased keyword -> first surface form seen
    found = {}

    def record(surface):
        key = surface.lower()
        if key in TECH_KEYWORDS:
            found.setdefault(key, surface)

    # 1. NER-based extraction (PRODUCT/ORG entities that are tech tools)
    for ent in doc.ents:
        if ent.label_ in ("PRODUCT", "ORG"):
            record(ent.text)

    # 2. Token-based extraction
    for token in doc:
        if not token.is_stop:
            record(token.text)

    # 3. Noun-chunk based; only matters once TECH_KEYWORDS holds multi-word entries
    for chunk in doc.noun_chunks:
        record(chunk.text)

    return sorted(found.values())

# Sample resume text for the spaCy-based extractor.
resume = """
Senior Data Scientist with 6 years of experience.
Proficient in Python, PyTorch, and TensorFlow for deep learning model development.
Built and deployed LLM-powered applications using LangChain, BERT, and GPT-4 API.
Experienced with Docker, Kubernetes, and AWS for production ML systems.
Strong background in SQL, Spark, and MongoDB for data engineering tasks.
"""

# Run the extractor on the sample and print whatever it returns.
skills = extract_skills_spacy(resume)
print("Extracted skills:", skills)

Approach 3: Transformer-Based Skill Extraction

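Use a transformer-based NER model. The example below runs a general-purpose pretrained model as a starting point; for production, fine-tune a NER model on a labeled resume dataset: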

from transformers import pipeline

# Use a pre-trained NER model; for production, fine-tune on labeled resumes
ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")


def extract_skills_transformer(resume_text, skill_whitelist=None):
    """Extract skill candidates from ``resume_text`` with the NER pipeline.

    Args:
        resume_text: Raw resume text.
        skill_whitelist: Optional iterable of skill names. When given, only
            entities whose text matches an entry (case-insensitively) are
            returned. When None, every entity of an accepted type is returned.

    Returns:
        List of dicts with keys "skill" (entity text), "type" (entity group
        label) and "confidence" (score rounded to 3 decimals, as a plain float).
    """
    # Lowercase the whitelist once so entries such as "PyTorch" match too.
    allowed = None
    if skill_whitelist is not None:
        allowed = {entry.lower() for entry in skill_whitelist}

    skills = []
    for entity in ner(resume_text):
        word = entity["word"].strip()
        label = entity["entity_group"]

        # MISC and ORG often contain tech tools in resumes.
        # NOTE(review): PER is accepted as well, which also lets real person
        # names through when no whitelist is given. Confirm this is intended.
        if label not in ("MISC", "ORG", "PER") or len(word) <= 1:
            continue
        if allowed is not None and word.lower() not in allowed:
            continue

        skills.append({
            "skill": word,
            "type": label,
            # float() guarantees a built-in float even if the pipeline hands
            # back a NumPy scalar, so the result stays JSON-serializable.
            "confidence": round(float(entity["score"]), 3)
        })

    return skills

# One-sentence sample; no whitelist is passed, so nothing is filtered by skill name.
resume = "Experienced ML engineer skilled in PyTorch, AWS, and Kubernetes. Previously at Google and OpenAI."
skills = extract_skills_transformer(resume)
# Each result prints as "[<type> <confidence>] <skill>".
for s in skills:
    print(f"[{s['type']} {s['confidence']}] {s['skill']}")

Approach 4: LLM-Powered Structured Extraction

For the richest, most context-aware extraction, prompt an LLM to return the skills as structured JSON:

from openai import OpenAI
import json

# Shared API client, created once at module level.
client = OpenAI()


def extract_resume_skills_llm(resume_text, *, model="gpt-4o-mini"):
    """Ask a chat model to extract skills from ``resume_text`` as JSON.

    The prompt requests the keys technical_skills, soft_skills, certifications,
    experience_years and seniority. The request uses JSON response mode and
    temperature 0.

    Args:
        resume_text: Raw resume text, embedded verbatim in the prompt.
        model: Chat model name passed to the API.

    Returns:
        The parsed JSON object from the model's reply.

    Raises:
        ValueError: If the reply carries no text content.
        json.JSONDecodeError: If the reply text is not valid JSON
            (a subclass of ValueError).
    """
    prompt = f"""Extract all skills and qualifications from this resume and return JSON.
Categories to identify:
- technical_skills: programming languages, frameworks, tools, platforms
- soft_skills: communication, leadership, collaboration, etc.
- certifications: named certifications or degrees
- experience_years: years of experience if mentioned
- seniority: junior/mid/senior/lead/principal if implied

Resume:
{resume_text}

Return only valid JSON with these exact keys.
"""

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
        temperature=0
    )

    content = response.choices[0].message.content
    if content is None:
        # json.loads(None) would fail with an opaque TypeError; say what happened instead.
        raise ValueError("LLM response contained no content to parse as JSON")

    return json.loads(content)

# Sample resume for the LLM-based extractor.
resume = """
Sarah Chen | Staff Engineer | New York

10+ years building scalable ML systems. Led teams of 8-12 engineers.
Expert in Python, PyTorch, Kubernetes, and AWS. Deep experience with NLP:
BERT fine-tuning, RAG pipelines with LangChain, vector databases (Pinecone, Weaviate).
AWS Certified ML Specialist. Published 3 papers on efficient transformer training.
Strong communicator and mentor. Experience with Agile and cross-functional collaboration.
"""

# Extract the skills and pretty-print the parsed result with 2-space indentation.
result = extract_resume_skills_llm(resume)
print(json.dumps(result, indent=2))

Matching Skills to Job Requirements

from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model, loaded once at module level; match_resume_to_job() reads it as a global.
model = SentenceTransformer('all-MiniLM-L6-v2')

def match_resume_to_job(resume_skills, job_requirements, threshold=0.7):
    """Match each job requirement to its most similar resume skill.

    Both lists are embedded with the module-level sentence-embedding model and
    compared by cosine similarity.

    Args:
        resume_skills: List of skill strings taken from the resume.
        job_requirements: List of requirement strings from the job posting.
        threshold: A requirement counts as met when its best similarity score
            is strictly greater than this value.

    Returns:
        One dict per requirement with keys "requirement", "best_match",
        "score" (rounded to 3 decimals) and "met", sorted by descending score.
        An empty list when either input is empty.
    """
    if not resume_skills or not job_requirements:
        return []

    resume_embeddings = model.encode(resume_skills)
    job_embeddings = model.encode(job_requirements)

    # One (requirements x skills) similarity matrix from the batch embeddings,
    # instead of re-encoding every requirement inside the loop.
    similarity = util.cos_sim(job_embeddings, resume_embeddings)

    matches = []
    for req, scores in zip(job_requirements, similarity):
        best_match_idx = scores.argmax().item()
        best_score = scores[best_match_idx].item()

        matches.append({
            "requirement": req,
            "best_match": resume_skills[best_match_idx],
            "score": round(best_score, 3),
            "met": best_score > threshold
        })

    return sorted(matches, key=lambda x: -x["score"])

# Sample inputs: skills pulled from a resume and the requirements of a job posting.
resume_skills = [
    "Python",
    "PyTorch",
    "BERT fine-tuning",
    "AWS SageMaker",
    "Docker",
    "SQL",
    "Leadership",
]
job_requirements = [
    "Deep learning framework experience",
    "Cloud platform deployment",
    "Natural language processing",
    "Container orchestration",
    "Team leadership experience",
]

matches = match_resume_to_job(resume_skills, job_requirements)

# Summary line first, then one two-line entry per requirement.
print(f"Match rate: {sum(m['met'] for m in matches)}/{len(matches)}")
print()
for m in matches:
    if m["met"]:
        status = "✓"
    else:
        status = "✗"
    print(f"{status} [{m['score']:.3f}] {m['requirement']}")
    print(f"       Best match: {m['best_match']}")

Building a Complete Resume Parser

import re
import spacy
from collections import defaultdict

# Small pretrained English pipeline, loaded once and reused by parse_resume().
nlp = spacy.load("en_core_web_sm")

# Category name -> regex with a single capture group, applied to lowercased text.
SKILL_PATTERNS = {
    # Lookarounds instead of \b here: \b cannot match after the final '+' of
    # "c++" unless a word character follows, so C++ was never detected.
    "languages": r'(?<!\w)(python|javascript|typescript|java|go|rust|scala|r|c\+\+|ruby)(?!\w)',
    "frameworks": r'\b(pytorch|tensorflow|react|fastapi|django|flask|nextjs|vue|angular)\b',
    "nlp_tools": r'\b(bert|gpt|transformers|spacy|langchain|llamaindex|hugging face|nltk)\b',
    "infrastructure": r'\b(docker|kubernetes|aws|gcp|azure|terraform|ansible)\b',
    "databases": r'\b(sql|postgresql|mysql|mongodb|redis|elasticsearch|pinecone)\b',
}


def parse_resume(text):
    """Parse a resume into skills, organizations, people, dates and experience.

    Skills come from the SKILL_PATTERNS regexes applied to the lowercased text.
    Organizations, people and dates come from spaCy named entities (ORG,
    PERSON, DATE). Years of experience are read from the first phrase shaped
    like "<n> years experience" or "<n>+ years of experience".

    Args:
        text: Raw resume text.

    Returns:
        Dict with keys:
            "skills": category -> sorted list of distinct matched skills
                (every category is present, possibly with an empty list).
            "organizations": distinct ORG entities in order of first appearance.
            "experience_years": int, or None when no such phrase is found.
            "mentioned_people": PERSON entity texts, duplicates kept.
            "dates": DATE entity texts, duplicates kept.
    """
    text_lower = text.lower()
    doc = nlp(text)

    # sorted() keeps each list stable across runs; bare set order is not.
    skills = {
        category: sorted(set(re.findall(pattern, text_lower)))
        for category, pattern in SKILL_PATTERNS.items()
    }

    people = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
    orgs = [ent.text for ent in doc.ents if ent.label_ == "ORG"]
    dates = [ent.text for ent in doc.ents if ent.label_ == "DATE"]

    years_match = re.search(r'(\d+)\+?\s+years?\s+(?:of\s+)?experience', text_lower)
    years = int(years_match.group(1)) if years_match else None

    return {
        "skills": skills,
        # dict.fromkeys de-duplicates while preserving first-seen order.
        "organizations": list(dict.fromkeys(orgs)),
        "experience_years": years,
        "mentioned_people": people,
        "dates": dates
    }