NLP + Traveling Salesman Problem: Extract Cities and Optimize Routes
This project combines two domains: use NLP to extract city names from unstructured text, then solve the Traveling Salesman Problem (TSP) to find the shortest route through those cities.
What Is the Traveling Salesman Problem?
The TSP asks: given a list of cities, what is the shortest route that visits each city exactly once and returns to the starting point? It’s NP-hard — no polynomial-time exact solution exists for large inputs — but excellent heuristics and approximation algorithms work well in practice.
In this project:
- NLP step — extract city names from a travel blog, itinerary, or message
- TSP step — find the optimal or near-optimal route through those cities
Step 1: Extract Cities from Text Using spaCy
import spacy
nlp = spacy.load("en_core_web_sm")
def extract_cities(text): doc = nlp(text) cities = []
for ent in doc.ents: if ent.label_ == "GPE": # Countries, cities, states cities.append(ent.text)
# Deduplicate while preserving order seen = set() unique_cities = [] for city in cities: if city.lower() not in seen: seen.add(city.lower()) unique_cities.append(city)
return unique_cities
travel_text = """I'm planning a European trip starting from London, then heading to Paris for 3 days.From Paris I'll take the train to Amsterdam, then fly to Berlin. After Berlin,I want to visit Vienna and Prague before ending the trip back in London."""
cities = extract_cities(travel_text)print("Extracted cities:", cities)# ['London', 'Paris', 'Amsterdam', 'Berlin', 'Vienna', 'Prague']Step 2: Get Coordinates for Each City
# pip install geopyfrom geopy.geocoders import Nominatimfrom geopy.distance import geodesicimport time
def get_city_coordinates(cities): geolocator = Nominatim(user_agent="nlp-tsp-demo") coordinates = {}
for city in cities: try: location = geolocator.geocode(city) if location: coordinates[city] = (location.latitude, location.longitude) print(f" {city}: ({location.latitude:.3f}, {location.longitude:.3f})") time.sleep(0.5) # Respect Nominatim rate limit except Exception as e: print(f" Could not find: {city} ({e})")
return coordinates
# Use hardcoded coordinates for this example (avoids API dependency)city_coords = { "London": (51.509, -0.118), "Paris": (48.857, 2.347), "Amsterdam": (52.367, 4.904), "Berlin": (52.520, 13.405), "Vienna": (48.208, 16.373), "Prague": (50.075, 14.438),}
def build_distance_matrix(city_coords): cities = list(city_coords.keys()) n = len(cities) matrix = [[0.0] * n for _ in range(n)]
for i, city1 in enumerate(cities): for j, city2 in enumerate(cities): if i != j: dist = geodesic(city_coords[city1], city_coords[city2]).kilometers matrix[i][j] = dist
return cities, matrix
cities_list, dist_matrix = build_distance_matrix(city_coords)print("\nDistance matrix (km) between cities:")header = f"{'':>12}" + "".join(f"{c[:6]:>9}" for c in cities_list)print(header)for i, city in enumerate(cities_list): row = f"{city[:12]:>12}" + "".join(f"{dist_matrix[i][j]:>9.0f}" for j in range(len(cities_list))) print(row)Step 3: Solve TSP with Nearest-Neighbor Heuristic
import numpy as np
def nearest_neighbor_tsp(cities, dist_matrix, start=0): n = len(cities) visited = [False] * n route = [start] visited[start] = True
for _ in range(n - 1): current = route[-1] nearest = None nearest_dist = float('inf')
for j in range(n): if not visited[j] and dist_matrix[current][j] < nearest_dist: nearest = j nearest_dist = dist_matrix[current][j]
route.append(nearest) visited[nearest] = True
route.append(start) # Return to start
# Calculate total distance total = sum(dist_matrix[route[i]][route[i+1]] for i in range(len(route)-1))
return route, total
route_indices, total_km = nearest_neighbor_tsp(cities_list, dist_matrix, start=0)
print("Optimized Route:")for i in range(len(route_indices) - 1): from_city = cities_list[route_indices[i]] to_city = cities_list[route_indices[i+1]] dist = dist_matrix[route_indices[i]][route_indices[i+1]] print(f" {from_city:<15} → {to_city:<15} {dist:>7.0f} km")
print(f"\nTotal distance: {total_km:.0f} km")Step 4: Solve with Google OR-Tools (Better Solution)
# pip install ortoolsfrom ortools.constraint_solver import routing_enums_pb2from ortools.constraint_solver import pywrapcp
def solve_tsp_ortools(cities, dist_matrix): n = len(cities)
# Convert to integer distances (OR-Tools requires integers) int_matrix = [[int(dist_matrix[i][j]) for j in range(n)] for i in range(n)]
manager = pywrapcp.RoutingIndexManager(n, 1, 0) # num_nodes, num_vehicles, depot routing = pywrapcp.RoutingModel(manager)
def distance_callback(from_index, to_index): from_node = manager.IndexToNode(from_index) to_node = manager.IndexToNode(to_index) return int_matrix[from_node][to_node]
transit_callback_index = routing.RegisterTransitCallback(distance_callback) routing.SetArcCostEvaluatorOfAllVehicles(transit_callback_index)
search_params = pywrapcp.DefaultRoutingSearchParameters() search_params.first_solution_strategy = routing_enums_pb2.FirstSolutionStrategy.PATH_CHEAPEST_ARC search_params.local_search_metaheuristic = routing_enums_pb2.LocalSearchMetaheuristic.GUIDED_LOCAL_SEARCH search_params.time_limit.seconds = 10
solution = routing.SolveWithParameters(search_params)
if not solution: return None, None
route = [] index = routing.Start(0) while not routing.IsEnd(index): route.append(manager.IndexToNode(index)) index = solution.Value(routing.NextVar(index)) route.append(manager.IndexToNode(index))
total = solution.ObjectiveValue() return route, total
route, total = solve_tsp_ortools(cities_list, dist_matrix)
if route: print("OR-Tools Optimized Route:") for i in range(len(route) - 1): fc = cities_list[route[i]] tc = cities_list[route[i+1]] dist = dist_matrix[route[i]][route[i+1]] print(f" {fc:<15} → {tc:<15} {dist:>7.0f} km") print(f"Total: {total} km")Full Pipeline: Text → Cities → Optimized Route
import spacy
nlp = spacy.load("en_core_web_sm")
# Predefined coordinates (production: use geocoding API)CITY_COORDS = { "london": (51.509, -0.118), "paris": (48.857, 2.347), "amsterdam": (52.367, 4.904), "berlin": (52.520, 13.405), "vienna": (48.208, 16.373), "prague": (50.075, 14.438), "rome": (41.902, 12.496), "barcelona": (41.385, 2.173), "madrid": (40.416, -3.703), "zurich": (47.377, 8.541),}
def nlp_tsp_pipeline(text): print(f"Input: {text}\n")
# Step 1: Extract cities doc = nlp(text) mentioned = [ent.text.lower() for ent in doc.ents if ent.label_ == "GPE"] known_cities = {c: CITY_COORDS[c] for c in mentioned if c in CITY_COORDS}
if len(known_cities) < 2: return "Need at least 2 recognizable cities."
print(f"Recognized cities: {list(known_cities.keys())}\n")
# Step 2: Distance matrix city_names = list(known_cities.keys()) coords = list(known_cities.values()) n = len(city_names)
from geopy.distance import geodesic matrix = [[geodesic(coords[i], coords[j]).km for j in range(n)] for i in range(n)]
# Step 3: Nearest-neighbor TSP route, total = nearest_neighbor_tsp(city_names, matrix)
print("Optimized route:") for i in range(len(route) - 1): print(f" {city_names[route[i]].title()} → {city_names[route[i+1]].title()}") print(f"Total distance: {total:.0f} km")
text = "I want to visit Paris, Berlin, Amsterdam, Vienna, and Prague starting from London."nlp_tsp_pipeline(text)Extensions
- Constraint-based routing — add time windows, hotel booking constraints
- Real-time traffic — use Google Maps Distance Matrix API instead of geodesic distance
- Multi-city extraction — add more city coordinates or integrate a geocoding API
- Web interface — wrap in a FastAPI or Streamlit app for interactive route planning