Technology  /  NLP

💬 Natural Language Processing 40 guides · updated 2026

From tokenisation and embeddings to transformer-based language understanding — the NLP fundamentals that underpin every modern LLM.

NLP + Traveling Salesman Problem: Extract Cities and Optimize Routes

This project combines two domains: use NLP to extract city names from unstructured text, then solve the Traveling Salesman Problem (TSP) to find the shortest route through those cities.


What Is the Traveling Salesman Problem?

The TSP asks: given a list of cities, what is the shortest route that visits each city exactly once and returns to the starting point? It’s NP-hard — no polynomial-time exact solution exists for large inputs — but excellent heuristics and approximation algorithms work well in practice.

In this project:

  1. NLP step — extract city names from a travel blog, itinerary, or message
  2. TSP step — find the optimal or near-optimal route through those cities

Step 1: Extract Cities from Text Using spaCy

import spacy
nlp = spacy.load("en_core_web_sm")
def extract_cities(text):
doc = nlp(text)
cities = []
for ent in doc.ents:
if ent.label_ == "GPE": # Countries, cities, states
cities.append(ent.text)
# Deduplicate while preserving order
seen = set()
unique_cities = []
for city in cities:
if city.lower() not in seen:
seen.add(city.lower())
unique_cities.append(city)
return unique_cities
travel_text = """
I'm planning a European trip starting from London, then heading to Paris for 3 days.
From Paris I'll take the train to Amsterdam, then fly to Berlin. After Berlin,
I want to visit Vienna and Prague before ending the trip back in London.
"""
cities = extract_cities(travel_text)
print("Extracted cities:", cities)
# ['London', 'Paris', 'Amsterdam', 'Berlin', 'Vienna', 'Prague']

Step 2: Get Coordinates for Each City

# pip install geopy
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
import time
def get_city_coordinates(cities):
geolocator = Nominatim(user_agent="nlp-tsp-demo")
coordinates = {}
for city in cities:
try:
location = geolocator.geocode(city)
if location:
coordinates[city] = (location.latitude, location.longitude)
print(f" {city}: ({location.latitude:.3f}, {location.longitude:.3f})")
time.sleep(0.5) # Respect Nominatim rate limit
except Exception as e:
print(f" Could not find: {city} ({e})")
return coordinates
# Use hardcoded coordinates for this example (avoids API dependency)
city_coords = {
"London": (51.509, -0.118),
"Paris": (48.857, 2.347),
"Amsterdam": (52.367, 4.904),
"Berlin": (52.520, 13.405),
"Vienna": (48.208, 16.373),
"Prague": (50.075, 14.438),
}
def build_distance_matrix(city_coords):
cities = list(city_coords.keys())
n = len(cities)
matrix = [[0.0] * n for _ in range(n)]
for i, city1 in enumerate(cities):
for j, city2 in enumerate(cities):
if i != j:
dist = geodesic(city_coords[city1], city_coords[city2]).kilometers
matrix[i][j] = dist
return cities, matrix
cities_list, dist_matrix = build_distance_matrix(city_coords)
print("\nDistance matrix (km) between cities:")
header = f"{'':>12}" + "".join(f"{c[:6]:>9}" for c in cities_list)
print(header)
for i, city in enumerate(cities_list):
row = f"{city[:12]:>12}" + "".join(f"{dist_matrix[i][j]:>9.0f}" for j in range(len(cities_list)))
print(row)

Step 3: Solve TSP with Nearest-Neighbor Heuristic

import numpy as np
def nearest_neighbor_tsp(cities, dist_matrix, start=0):
n = len(cities)
visited = [False] * n
route = [start]
visited[start] = True
for _ in range(n - 1):
current = route[-1]
nearest = None
nearest_dist = float('inf')
for j in range(n):
if not visited[j] and dist_matrix[current][j] < nearest_dist:
nearest = j
nearest_dist = dist_matrix[current][j]
route.append(nearest)
visited[nearest] = True
route.append(start) # Return to start
# Calculate total distance
total = sum(dist_matrix[route[i]][route[i+1]] for i in range(len(route)-1))
return route, total
route_indices, total_km = nearest_neighbor_tsp(cities_list, dist_matrix, start=0)
print("Optimized Route:")
for i in range(len(route_indices) - 1):
from_city = cities_list[route_indices[i]]
to_city = cities_list[route_indices[i+1]]
dist = dist_matrix[route_indices[i]][route_indices[i+1]]
print(f" {from_city:<15}{to_city:<15} {dist:>7.0f} km")
print(f"\nTotal distance: {total_km:.0f} km")

Step 4: Solve with Google OR-Tools (Better Solution)

# pip install ortools
from ortools.constraint_solver import routing_enums_pb2
from ortools.constraint_solver import pywrapcp
def solve_tsp_ortools(cities, dist_matrix):
n = len(cities)
# Convert to integer distances (OR-Tools requires integers)
int_matrix = [[int(dist_matrix[i][j]) for j in range(n)] for i in range(n)]
manager = pywrapcp.RoutingIndexManager(n, 1, 0) # num_nodes, num_vehicles, depot
routing = pywrapcp.RoutingModel(manager)
def distance_callback(from_index, to_index):
from_node = manager.IndexToNode(from_index)
to_node = manager.IndexToNode(to_index)
return int_matrix[from_node][to_node]
transit_callback_index = routing.RegisterTransitCallback(distance_callback)
routing.SetArcCostEvaluatorOfAllVehicles(transit_callback_index)
search_params = pywrapcp.DefaultRoutingSearchParameters()
search_params.first_solution_strategy = routing_enums_pb2.FirstSolutionStrategy.PATH_CHEAPEST_ARC
search_params.local_search_metaheuristic = routing_enums_pb2.LocalSearchMetaheuristic.GUIDED_LOCAL_SEARCH
search_params.time_limit.seconds = 10
solution = routing.SolveWithParameters(search_params)
if not solution:
return None, None
route = []
index = routing.Start(0)
while not routing.IsEnd(index):
route.append(manager.IndexToNode(index))
index = solution.Value(routing.NextVar(index))
route.append(manager.IndexToNode(index))
total = solution.ObjectiveValue()
return route, total
route, total = solve_tsp_ortools(cities_list, dist_matrix)
if route:
print("OR-Tools Optimized Route:")
for i in range(len(route) - 1):
fc = cities_list[route[i]]
tc = cities_list[route[i+1]]
dist = dist_matrix[route[i]][route[i+1]]
print(f" {fc:<15}{tc:<15} {dist:>7.0f} km")
print(f"Total: {total} km")

Full Pipeline: Text → Cities → Optimized Route

import spacy
nlp = spacy.load("en_core_web_sm")
# Predefined coordinates (production: use geocoding API)
CITY_COORDS = {
"london": (51.509, -0.118), "paris": (48.857, 2.347),
"amsterdam": (52.367, 4.904), "berlin": (52.520, 13.405),
"vienna": (48.208, 16.373), "prague": (50.075, 14.438),
"rome": (41.902, 12.496), "barcelona": (41.385, 2.173),
"madrid": (40.416, -3.703), "zurich": (47.377, 8.541),
}
def nlp_tsp_pipeline(text):
print(f"Input: {text}\n")
# Step 1: Extract cities
doc = nlp(text)
mentioned = [ent.text.lower() for ent in doc.ents if ent.label_ == "GPE"]
known_cities = {c: CITY_COORDS[c] for c in mentioned if c in CITY_COORDS}
if len(known_cities) < 2:
return "Need at least 2 recognizable cities."
print(f"Recognized cities: {list(known_cities.keys())}\n")
# Step 2: Distance matrix
city_names = list(known_cities.keys())
coords = list(known_cities.values())
n = len(city_names)
from geopy.distance import geodesic
matrix = [[geodesic(coords[i], coords[j]).km for j in range(n)] for i in range(n)]
# Step 3: Nearest-neighbor TSP
route, total = nearest_neighbor_tsp(city_names, matrix)
print("Optimized route:")
for i in range(len(route) - 1):
print(f" {city_names[route[i]].title()}{city_names[route[i+1]].title()}")
print(f"Total distance: {total:.0f} km")
text = "I want to visit Paris, Berlin, Amsterdam, Vienna, and Prague starting from London."
nlp_tsp_pipeline(text)

Extensions