Noise Removal in NLP

Noise is anything in your text that doesn’t contribute to meaning — HTML markup, URLs, emoji, boilerplate footers, encoding artifacts, and duplicate whitespace. Removing it before modeling ensures the signal-to-noise ratio is high enough for your pipeline to learn from.

Types of Noise in Text Data

Noise Type	Example	Impact
HTML/XML tags and entities	`<p>`, `<br/>`, `&amp;`	Pollutes token list
URLs	`https://example.com/page?id=123`	Rare tokens, no semantic value
Email addresses	`user@example.com`	PII risk, rarely informative
Special characters	`©`, `™`, `★`, `→`	Encoding issues
Emoji	`😊🔥💯`	May carry sentiment (keep for sentiment tasks)
Repeated characters	`noooooo`, `!!!!!!`	Non-standard tokens
Boilerplate	`"Copyright 2025. All rights reserved."`	Adds noise, hurts topic modeling
Encoding artifacts	`â€œ`, `Ã©`, `\xa0`	Break tokenization
Duplicate whitespace	`"Hello    World"`	Creates empty tokens

Removing HTML

from bs4 import BeautifulSoup

def strip_html(text):
    """Return the visible text of an HTML fragment as one clean line.

    Args:
        text: HTML markup (or plain text) to strip.

    Returns:
        The text content with all tags removed, every run of whitespace
        collapsed to a single space, and no leading or trailing whitespace.
    """
    soup = BeautifulSoup(text, "html.parser")
    # The space separator keeps adjacent elements from fusing together, but it
    # stacks on top of the markup's own indentation and newlines, so the raw
    # result is full of blank runs. Normalise them before returning.
    return " ".join(soup.get_text(separator=" ").split())

# Small HTML fragment with a heading, nested inline tags and a link.
html = """
<div class="article">
  <h1>Transformers in NLP</h1>
  <p>The <strong>transformer</strong> architecture has <em>revolutionized</em> NLP.</p>
  <a href="/more">Read more</a>
</div>
"""
print(strip_html(html))
# Extracted text, shown with surrounding and duplicate whitespace collapsed:
# "Transformers in NLP The transformer architecture has revolutionized NLP. Read more"

Removing URLs and Email Addresses

import re

def remove_urls(text):
    """Delete http(s)://, ftp:// and bare ``www.`` URLs from *text*.

    Args:
        text: Input string.

    Returns:
        *text* with each URL replaced by the empty string. Whitespace around
        the URL is left in place, as is sentence punctuation (``.,;:!?``)
        that directly follows it.
    """
    # The look-behind makes the match give back trailing punctuation, so
    # "see https://example.com." keeps its full stop instead of losing the
    # sentence boundary along with the link.
    pattern = r'(?:https?://|ftp://|www\.)\S+(?<![.,;:!?])'
    return re.sub(pattern, '', text)

def remove_emails(text):
    """Delete e-mail addresses from *text*.

    Args:
        text: Input string.

    Returns:
        *text* with each address replaced by the empty string; surrounding
        whitespace and punctuation are left untouched.
    """
    # The top-level domain class is letters only. Writing it as "[A-Z|a-z]"
    # would also admit a literal "|" and let the match run past the address.
    pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    return re.sub(pattern, '', text)

# Sample sentence containing one e-mail address and one URL.
text = "Contact us at support@company.com or visit https://company.com/help for assistance."
# URLs are removed first, then e-mail addresses.
cleaned = remove_emails(remove_urls(text))
print(cleaned)
# "Contact us at  or visit  for assistance."
# (each removal leaves the spaces on both sides behind, hence the double spaces)

# Clean up extra spaces afterward
cleaned = re.sub(r'\s+', ' ', cleaned).strip()
print(cleaned)
# "Contact us at or visit for assistance."

Removing Special Characters and Punctuation

import re

def remove_special_chars(text, keep_punctuation=False):
    """Delete every character that is not on the allowed list.

    Args:
        text: Input string.
        keep_punctuation: When true, the characters ``. , ! ? ' " -`` are
            allowed in addition to ASCII letters, digits and whitespace.

    Returns:
        *text* with all disallowed characters deleted (not replaced), so the
        whitespace that surrounded them is left as it was.
    """
    disallowed = (
        r'[^a-zA-Z0-9\s.,!?\'"-]'  # letters, digits, whitespace, common punctuation
        if keep_punctuation
        else r'[^a-zA-Z0-9\s]'     # letters, digits, whitespace only
    )
    return re.sub(disallowed, '', text)

# Sample with a percent sign, an exclamation mark and two non-ASCII symbols.
text = "The model achieved 94.5% accuracy on the GLUE benchmark! © 2025 AI Lab™"

print(remove_special_chars(text, keep_punctuation=True))
# "The model achieved 94.5 accuracy on the GLUE benchmark!  2025 AI Lab"
# (characters are deleted, not replaced, so the spaces around "©" remain as a double space)

print(remove_special_chars(text, keep_punctuation=False))
# "The model achieved 945 accuracy on the GLUE benchmark  2025 AI Lab"

Handling Emoji

import emoji
import re

def remove_emoji(text):
    """Return *text* with every emoji replaced by the empty string."""
    without_emoji = emoji.replace_emoji(text, replace='')
    return without_emoji

def replace_emoji_with_meaning(text):
    """Return *text* with each emoji swapped for its name from the emoji package.

    A single space is used as both the opening and closing delimiter in place
    of the package's usual colons.
    """
    space_delimiters = (" ", " ")
    return emoji.demojize(text, delimiters=space_delimiters)

# Sample sentence with three emoji, one of them at the very end.
text = "I love 🤖 NLP! This model is 🔥 amazing! 😊"

print(remove_emoji(text))
# "I love  NLP! This model is  amazing! "

print(replace_emoji_with_meaning(text))
# "I love  robot  NLP! This model is  fire  amazing!  smiling_face_with_smiling_eyes "
# NOTE(review): multi-word emoji names are joined with underscores, and the
# exact names come from the installed emoji package — confirm against your version.

For sentiment analysis, replacing emoji with their descriptions often improves accuracy — a 🔥 is positive, a 😡 is negative.

Handling Encoding Artifacts

import ftfy

def fix_encoding(text):
    """Return *text* after passing it through ``ftfy.fix_text``."""
    repaired = ftfy.fix_text(text)
    return repaired

# Common encoding corruption patterns
# Sample inputs: one genuinely corrupted string plus two strings with
# typographic / non-ASCII characters, to compare how each is treated.
bad_texts = [
    "The café was â€œgoodâ€\x9d according to reviews.",  # Mojibake: curly quotes decoded with the wrong codec
    "It’s a nice day… isn’t it?",         # Smart quotes and an ellipsis character
    "Price: \xa3100 or €120",                        # Currency symbols ("\xa3" is the pound sign)
]

# Show each string before (as a repr, so escapes are visible) and after repair.
for text in bad_texts:
    print(f"Before: {repr(text)}")
    print(f"After:  {fix_encoding(text)}\n")

Social media introduces unique noise: hashtags, mentions, slang, repeated characters, and mixed casing:

import re

def clean_social_media(text):
    """Normalise a social-media post into plain, single-spaced text.

    Steps, in order: drop @mentions, unwrap #hashtags (keeping the word),
    drop URLs, shorten exaggerated character runs, replace every character
    outside ASCII letters/digits/whitespace and ``. , ! ? ' "`` with a space,
    then collapse whitespace.

    Args:
        text: Raw post text.

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    text = re.sub(r'@\w+', '', text)              # Remove mentions
    text = re.sub(r'#(\w+)', r'\1', text)         # Keep hashtag words, drop #
    # Require a real scheme (or a leading "www.") so ordinary words that start
    # with "http" survive. www-links must go before the repeat rule below,
    # which would otherwise turn "www." into "ww.".
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # "noooo" → "noo", "!!!" → "!!" (max 2 repeats). Digits are excluded so
    # numbers such as "1000000" keep their value.
    text = re.sub(r'([^\d\n])\1{2,}', r'\1\1', text)
    text = re.sub(r'[^a-zA-Z0-9\s.,!?\'"]', ' ', text)  # Remove noise
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Sample tweet with a mention, stretched words, a URL, hashtags and emoji.
tweet = "@user1 This model is sooooo goood!!! Check https://t.co/abc #NLP #AI 🔥🔥🔥"
print(clean_social_media(tweet))
# "This model is soo good!! Check NLP AI"
# ("!!!" is shortened to "!!", and "Check" stays because only the URL after it is removed)

Complete Noise Removal Pipeline

import re
from bs4 import BeautifulSoup
import unicodedata

def clean_text(text,
               remove_html=True,
               remove_urls=True,
               remove_emails=True,
               fix_encoding_issues=True,
               lowercase=False,
               ascii_only=True):
    """Run the noise-removal steps over *text* and return the cleaned string.

    Steps run in this order: HTML stripping, URL removal, e-mail removal,
    Unicode normalisation, character filtering, whitespace collapsing,
    lower-casing.

    Args:
        text: Raw input string.
        remove_html: Strip markup with BeautifulSoup, separating text nodes
            with a space.
        remove_urls: Delete ``http(s)://`` and ``www.`` URLs. Sentence
            punctuation directly after a URL is kept.
        remove_emails: Delete e-mail addresses, including multi-label
            domains such as ``user@mail.example.com``.
        fix_encoding_issues: Apply NFKC normalisation and turn curly quotes
            into their ASCII equivalents. This does not repair mojibake.
        lowercase: Lower-case the final result.
        ascii_only: When true (the default), every character outside
            printable ASCII and whitespace is replaced by a space. When
            false, only non-printable characters are replaced, so accented
            letters and other scripts survive.

    Returns:
        The cleaned text, single-spaced, with no leading or trailing
        whitespace.
    """
    if remove_html:
        text = BeautifulSoup(text, "html.parser").get_text(separator=" ")

    if remove_urls:
        # The look-behind hands back trailing sentence punctuation, so
        # "at https://example.com!" keeps its "!".
        text = re.sub(r'(?:https?://|www\.)\S+(?<![.,;:!?])', '', text)

    if remove_emails:
        # Allow any number of dot-separated domain labels and either letter
        # case in the top-level domain; a single-label pattern would stop
        # early and leave ".com" behind for "user@mail.example.com".
        text = re.sub(
            r'\b[\w.+-]+@[\w-]+(?:\.[\w-]+)*\.[A-Za-z]{2,}\b', '', text
        )

    if fix_encoding_issues:
        text = unicodedata.normalize('NFKC', text)
        # NFKC leaves curly quotes alone. Map them to ASCII here, otherwise
        # the ASCII filter below would turn "we’d" into "we d".
        text = text.translate(str.maketrans({
            '\u2018': "'",
            '\u2019': "'",
            '\u201c': '"',
            '\u201d': '"',
        }))

    if ascii_only:
        # Replace everything outside printable ASCII (and whitespace)
        text = re.sub(r'[^\x20-\x7E\s]', ' ', text)
    else:
        # Replace only non-printable characters
        text = ''.join(
            ch if ch.isprintable() or ch.isspace() else ' ' for ch in text
        )

    # Collapse whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    if lowercase:
        text = text.lower()

    return text

# Sample input mixing HTML tags, a URL, an e-mail address, a mojibake
# sequence and a curly apostrophe.
raw = """<p>Check out our latest <b>AI research</b> at https://lab.example.com!
Contact info@lab.example.com â€" we’d love to hear from you.</p>"""

print(clean_text(raw))
# "Check out our latest AI research at! Contact we'd love to hear from you."
# NOTE(review): the line above is the intended result, not captured output.
# Re-run to confirm the exact text, in particular the "!" after the URL, the
# mojibake sequence and the apostrophe.

What NOT to Remove

Some “noise” carries signal:

Numbers — financial documents need figures preserved
Punctuation — sentiment and question detection rely on ! and ?
Emoji — positive/negative sentiment in social media analysis
Capitalization — NER needs “Apple” (company) vs “apple” (fruit)

Always define noise relative to your task. What is noise for topic modeling may be signal for sentiment analysis.