NLP Lab Programs
1. Write a Python Program to perform following tasks on text
a) Tokenization b) Stop word Removal
Program:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download required NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')  # fix for latest nltk versions
nltk.download('stopwords')

text = "Natural Language Processing helps computers understand human language."

# Tokenization: split the sentence into word and punctuation tokens.
tokens = word_tokenize(text)
print("Tokens:", tokens)

# Stopword Removal: the comparison lower-cases each token so that
# capitalised words still match entries in the stopword list.
stop_words = set(stopwords.words('english'))
filtered = [w for w in tokens if w.lower() not in stop_words]
print("Filtered Tokens:", filtered)
2. Write a Python program to implement Porter stemmer algorithm for stemming
Program:
# ------------------------------------------------------------
# Porter Stemmer Implementation using NLTK
# ------------------------------------------------------------
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Download punkt (needed for tokenization)
nltk.download('punkt')
nltk.download('punkt_tab')  # for latest versions

# Sample text
text = "The children are playing happily in the garden while their parents are watching them."

# Tokenization
tokens = word_tokenize(text)

# Create Porter Stemmer object
ps = PorterStemmer()

# Apply stemming to every token (punctuation tokens are passed through stem() too).
stemmed_words = [ps.stem(word) for word in tokens]

print("Original Tokens:", tokens)
print("Stemmed Words:", stemmed_words)
3. Write Python Program for a) Word Analysis b) Word Generation
Program:
A.
# Simple Word Analysis Program
prefixes = ["un", "re", "in", "im", "dis"]
suffixes = ["ing", "ed", "s", "er", "ness", "ly"]


def analyze_word(word):
    """Split ``word`` into ``(prefix, root, suffix)`` using the lists above.

    Affixes are tried longest-first so that a short affix cannot shadow a
    longer one (for example "s" matching before "ness"). An affix is only
    stripped when at least one character would remain, so the root of a
    non-empty word is never empty.

    Args:
        word: The word to analyse.

    Returns:
        A ``(prefix, root, suffix)`` tuple of strings; ``prefix`` and
        ``suffix`` are ``""`` when nothing matched.
    """
    found_prefix = ""
    for p in sorted(prefixes, key=len, reverse=True):
        if word.startswith(p) and len(word) > len(p):
            found_prefix = p
            break

    # The suffix is searched in what is left after the prefix, so the two
    # affixes can never overlap.
    remainder = word[len(found_prefix):]
    found_suffix = ""
    for s in sorted(suffixes, key=len, reverse=True):
        if remainder.endswith(s) and len(remainder) > len(s):
            found_suffix = s
            break

    root = remainder[:len(remainder) - len(found_suffix)]
    return found_prefix, root, found_suffix


if __name__ == "__main__":
    word = input("Enter a word: ")
    found_prefix, root, found_suffix = analyze_word(word)
    print("\n--- Word Analysis ---")
    print("Prefix :", found_prefix if found_prefix else "None")
    print("Root :", root)
    print("Suffix :", found_suffix if found_suffix else "None")
B.
# Simple Word Generation Program
prefixes = ["un", "re", "dis"]
suffixes = ["ing", "ed", "s", "er"]


def generate_words(root):
    """Return the candidate words formed by attaching each affix to ``root``.

    Prefixed forms come first (in ``prefixes`` order), followed by the
    suffixed forms (in ``suffixes`` order). This is plain concatenation; no
    spelling rules are applied.
    """
    with_prefix = [p + root for p in prefixes]
    with_suffix = [root + s for s in suffixes]
    return with_prefix + with_suffix


if __name__ == "__main__":
    root = input("Enter root word: ")
    generated_words = generate_words(root)
    print("\n--- Generated Words ---")
    for w in generated_words:
        print(w)
4. Create a Sample list for at least 5 words with ambiguous sense and Write a Python program to
implement WSD
Program:
"""
WSD using NLTK + WordNet (Lesk-like overlap method).
- Requires: nltk
- Downloads WordNet data automatically if missing.
"""
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string

# Disabled alternative kept for reference: download a package only when a
# lookup for it fails.
# NOTE(review): the call inside the try was lost in extraction;
# nltk.data.find is the presumed original - confirm before re-enabling.
# nltk_packages = ["wordnet", "omw-1.4", "punkt", "stopwords"]
# for pkg in nltk_packages:
#     try:
#         nltk.data.find(pkg)
#     except LookupError:
#         nltk.download(pkg)

# Fetch the NLTK resources used below (straight quotes; the curly quotes in
# the earlier version were a syntax error).
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')  # fix for latest nltk versions
nltk.download('stopwords')

# Shared, read-only helpers for token normalisation.
STOP = set(stopwords.words("english"))
PUNCT = set(string.punctuation)
STEMMER = PorterStemmer()
def normalize_tokens(text):
    """Tokenize, lowercase, remove stopwords/punctuation, and stem.

    Args:
        text: Raw text to normalise.

    Returns:
        A list of stemmed tokens. Tokens that are single punctuation
        characters, stopwords, or shorter than two characters are dropped.
    """
    return [
        STEMMER.stem(tok)
        for tok in word_tokenize(text.lower())
        if tok not in PUNCT and tok not in STOP and len(tok) >= 2
    ]
def synset_signature(syn):
    """Build the signature (set of normalized tokens) for a synset.

    The signature combines the synset's definition, its examples, its lemma
    names, and the lemma names of its immediate hypernyms.

    Args:
        syn: A WordNet synset.

    Returns:
        A set of stemmed, lower-cased tokens.
    """
    sig = set(normalize_tokens(syn.definition()))

    for ex in syn.examples():
        sig.update(normalize_tokens(ex))

    # Lemma names of the synset itself, then of its hypernyms for a little
    # extra context. Multi-word lemmas are joined with "_", so split them and
    # skip any empty piece.
    for related in [syn, *syn.hypernyms()]:
        for lemma in related.lemmas():
            for part in lemma.name().split("_"):
                if part:
                    sig.add(STEMMER.stem(part.lower()))
    return sig
def lesk_wsd(target_word, sentence):
    """Lesk-like WSD for ``target_word`` in ``sentence``.

    Each candidate synset is scored by how many normalized context tokens
    also appear in its signature.

    Args:
        target_word: The ambiguous word to look up in WordNet.
        sentence: The sentence providing the context.

    Returns:
        ``None`` when the sentence yields no context tokens or WordNet has no
        synsets for the word; otherwise a ``(best_synset, score)`` tuple.
    """
    # Build the context set once instead of once per candidate.
    context = set(normalize_tokens(sentence))
    if not context:
        return None

    candidates = wn.synsets(target_word)
    if not candidates:
        return None

    best_syn = None
    best_score = -1
    for syn in candidates:
        score = len(context & synset_signature(syn))
        # Strict '>' means ties keep the earliest candidate returned by
        # wn.synsets(); no separate frequency tie-breaker is applied.
        if score > best_score:
            best_score = score
            best_syn = syn
    return best_syn, best_score
# --- Demo data: (ambiguous word, sentence using it) pairs. ---
# Five words, each shown in two sentences that call for different senses.
SAMPLES = [
    ("bank", "I deposited my paycheck at the bank yesterday."),
    ("bank", "The canoe was pulled up on the muddy bank of the river."),

    ("bat", "A bat flew out of the cave at dusk."),
    ("bat", "He gripped the cricket bat and ran to the crease."),

    ("plant", "The power plant was shut down after the accident."),
    ("plant", "She put the new plant on the balcony and watered it."),

    ("lead", "Old pipes often contain lead which is harmful."),
    ("lead", "She will lead the project next month."),

    ("bass", "He likes to fish for bass in the lake."),
    ("bass", "Turn up the bass on that song; I love the low end."),
]
if __name__ == "__main__":
    # Run the Lesk-style disambiguator over every sample and print the
    # predicted sense together with its WordNet details.
    print("NLTK + WordNet Lesk-style WSD Demo\n" + "-"*36)
    for word, sent in SAMPLES:
        result = lesk_wsd(word, sent)
        # lesk_wsd returns None when it has nothing to work with, otherwise a
        # (synset, score) tuple.
        if result is None or result[0] is None:
            print(f"Word: {word}\n Sentence: {sent}\n -> No sense found.\n")
            continue
        syn, score = result
        # present info
        print(f"Word: {word}\n Sentence: {sent}")
        print(f" -> Predicted synset: {syn.name()} (score={score})")
        print(f" Definition : {syn.definition()}")
        print(f" Examples : {syn.examples()}")
        print(f" Lemmas : {', '.join(syn.lemma_names())}\n")
Easy:
import nltk
from nltk.wsd import lesk
from nltk.tokenize import word_tokenize

# download once (uncomment if needed)
# nltk.download("wordnet"); nltk.download("punkt"); nltk.download("punkt_tab")

sentence = input("Enter sentence: ")
word = input("Enter ambiguous word: ")

# lesk() takes the tokenized context and the target word; a falsy result is
# reported as "No sense found." below.
sense = lesk(word_tokenize(sentence), word)
if sense:
    print("Predicted sense:", sense.name())
    print("Definition:", sense.definition())
else:
    print("No sense found.")
Best:
# Sample list of ambiguous words with multiple meanings.
# Each word maps to exactly two glosses, keyed "sense1" and "sense2".
ambiguous_words = {
    "bank": {
        "sense1": "A financial institution that handles money and provides financial services.",
        "sense2": "The side of a river or a stream.",
    },
    "bat": {
        "sense1": "A flying mammal.",
        "sense2": "A piece of equipment used in sports like baseball to hit the ball.",
    },
    "bark": {
        "sense1": "The outer covering of a tree.",
        "sense2": "The sound made by a dog.",
    },
    "match": {
        "sense1": "A competition or game.",
        "sense2": "A device used to start a fire.",
    },
    "bore": {
        "sense1": "A person or thing that is dull and uninteresting.",
        "sense2": "To make a hole in something using a tool.",
    },
}
# Function to perform Word Sense Disambiguation
# Context clues per ambiguous word: (sense key, substrings) pairs, checked in
# order. Keeping the clues per word stops one word's clue (e.g. "dog") from
# selecting a sense of a different word.
_SENSE_CLUES = {
    "bank": (
        ("sense2", ("river", "water")),
        ("sense1", ("money", "financial")),
    ),
    "bat": (
        ("sense2", ("sport", "hit", "ball")),
        ("sense1", ("flew", "fly", "cave", "wing")),
    ),
    "bark": (
        ("sense2", ("dog",)),
        ("sense1", ("tree", "trunk")),
    ),
    "match": (
        ("sense1", ("game", "team", "tournament")),
        ("sense2", ("fire", "flame", "candle")),
    ),
    "bore": (
        ("sense1", ("dull", "boring")),
        ("sense2", ("hole", "drill")),
    ),
}


def disambiguate_word(word, context):
    """Pick a sense of ``word`` from keyword clues found in ``context``.

    Args:
        word: A key of ``ambiguous_words`` (matched case-sensitively).
        context: The sentence in which the word occurs.

    Returns:
        A message naming the chosen sense, or an explanatory message when the
        word is unknown or no clue for it occurs in the context.
    """
    # Clues are matched as substrings of the lower-cased context, so "sport"
    # also matches "sports". The lower-cased text is what the message echoes.
    context = context.lower()
    if word not in ambiguous_words:
        return "Word not found in ambiguous words list."
    senses = ambiguous_words[word]
    for sense_key, clues in _SENSE_CLUES.get(word, ()):
        if any(clue in context for clue in clues):
            sense = senses[sense_key]
            return f"The word '{word}' in context '{context}' refers to: {sense}"
    return "Could not determine the sense of the word based on the context."
# Demo sentences, one per call below.
context1 = "The bank is located on the river."
context2 = "I need to go to the bank to withdraw some money."
context3 = "The dog barked loudly in the yard."
context4 = "He hit the ball with a bat during the game."
context5 = "The movie was so boring, I was about to fall asleep."
context6 = "We need to bore a hole into the wall."

# Disambiguate each (word, sentence) pair in order and print the verdict.
for target, sentence in (
    ("bank", context1),
    ("bank", context2),
    ("bark", context3),
    ("bat", context4),
    ("bore", context5),
    ("bore", context6),
):
    print(disambiguate_word(target, sentence))
5. Install NLTK tool kit and perform stemming
Program:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Initialize the stemmer
stemmer = PorterStemmer()

# List of words to stem
words = ["running", "flies", "easily", "fairly", "crying", "happiness",
         "playing"]

# Apply stemming and display results
for word in words:
    print(f"Original: {word} --> Stemmed: {stemmer.stem(word)}")
6. Create Sample list of at least 10 words POS tagging and find the POS for any given word
Program:
# POS tagging without NLTK data (exam-safe)
# Hand-written lookup table: lower-cased word -> part-of-speech tag.
pos_dictionary = {
    "run": "VB",
    "beautiful": "JJ",
    "quickly": "RB",
    "computer": "NN",
    "play": "VB",
    "jump": "VB",
    "happy": "JJ",
    "india": "NNP",
    "walked": "VBD",
    "singing": "VBG",
    "teacher": "NN",
    "dogs": "NNS",
}

# Sample words
words = [
    "run", "beautiful", "quickly", "computer", "play",
    "jump", "happy", "India", "walked", "singing",
    "teacher", "dogs",
]


def find_pos(word, default="NN"):
    """Return the tag stored for ``word`` in ``pos_dictionary``.

    The lookup is case-insensitive because the dictionary keys are lower-case.

    Args:
        word: The word to look up.
        default: Tag returned for words that are not in the dictionary.

    Returns:
        The stored tag, or ``default`` for unknown words.
    """
    return pos_dictionary.get(word.lower(), default)


if __name__ == "__main__":
    print("POS Tags for the Sample Words:")
    for w in words:
        tag = find_pos(w)
        print(f"{w} → {tag}")

    # User input
    word = input("\nEnter a word to find its POS: ").strip()
    print(f"The POS tag for '{word}' is: {find_pos(word)}")
7. Write a Python program to
a) Perform Morphological Analysis using NLTK library
b) Generate n-grams using NLTK N-Grams library
c) Implement N-Grams Smoothing
a) Perform Morphological Analysis using NLTK library
# morph_analysis_simple.py
# Simple morphological analysis using NLTK
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# downloads (safe to call repeatedly)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)  # tokenizer data for latest nltk versions
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)  # tagger data for latest nltk versions
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

# Shared stemmer and lemmatizer instances used by analyze().
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()
def pos_to_wordnet_tag(tag):
    """Map a POS tag to the matching WordNet part-of-speech constant.

    Only the first letter of ``tag`` is inspected: J -> adjective,
    V -> verb, N -> noun, R -> adverb.

    Args:
        tag: A POS tag string such as those produced by ``pos_tag``.

    Returns:
        The WordNet constant for the tag, or ``None`` when the tag starts
        with any other letter.
    """
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('N'):
        return wordnet.NOUN
    if tag.startswith('R'):
        return wordnet.ADV
    return None
def analyze(text):
    """Tokenize ``text`` and report POS tag, Porter stem and lemma per token.

    Args:
        text: The sentence or passage to analyse.

    Returns:
        A list with one dict per token, holding the keys ``'token'``,
        ``'pos'``, ``'porter_stem'`` and ``'lemma'``.
    """
    results = []
    for tok, tag in pos_tag(word_tokenize(text)):
        wn_tag = pos_to_wordnet_tag(tag)
        # Pass the POS to the lemmatizer when there is a WordNet equivalent;
        # otherwise fall back to the lemmatizer's default.
        if wn_tag:
            lemma = lemmatizer.lemmatize(tok, wn_tag)
        else:
            lemma = lemmatizer.lemmatize(tok)
        results.append({
            'token': tok,
            'pos': tag,
            'porter_stem': ps.stem(tok),
            'lemma': lemma
        })
    return results
# demo: print the analysis of one sample sentence, one token per line.
if __name__ == "__main__":
    s = "The runners were running quickly toward the finishing line."
    for item in analyze(s):
        print(item)
b) Generate n-grams using NLTK N-Grams library
# generate_ngrams_simple.py
# Simple n-gram generation using NLTK
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import Counter

nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)  # tokenizer data for latest nltk versions
def generate_ngrams(text, n=2, pad=False):
    """Generate the n-grams of ``text`` and count how often each occurs.

    Args:
        text: The text to tokenize.
        n: Size of each n-gram.
        pad: When true, ``n - 1`` ``'<s>'`` markers are prepended and
            ``n - 1`` ``'</s>'`` markers appended before the n-grams are
            built (so nothing is added for ``n == 1``).

    Returns:
        A ``(grams, freq)`` tuple: the list of n-gram tuples in order of
        appearance and a ``Counter`` of their frequencies.
    """
    tokens = word_tokenize(text)
    if pad:
        tokens = ['<s>'] * (n - 1) + tokens + ['</s>'] * (n - 1)
    grams = list(ngrams(tokens, n))
    freq = Counter(grams)
    return grams, freq
# demo: show padded uni-, bi- and trigrams of one sentence.
if __name__ == "__main__":
    text = "I love natural language processing and I love coding."
    for n in (1, 2, 3):
        grams, freq = generate_ngrams(text, n=n, pad=True)
        print(f"\nTop {n}-grams (first 10):", grams[:10])
        print("Most common:", freq.most_common(5))
c) Implement N-Grams Smoothing
# bigram_addk_simple.py
# Simple bigram model with add-k smoothing
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter, defaultdict

nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)  # tokenizer data for latest nltk versions
class BigramAddKSmoother:
    """Bigram model that estimates P(w2 | w1) with add-k smoothing.

    All tokens are lower-cased, both when fitting and when querying.

    NOTE(review): the vocabulary attribute's name was lost in extraction;
    ``vocab`` is the presumed original - confirm against any external users.
    """

    def __init__(self, k=1.0):
        """Create an empty model.

        Args:
            k: Smoothing constant added to every bigram count.

        Raises:
            ValueError: If ``k`` is negative (it would allow negative
                probabilities).
        """
        if k < 0:
            raise ValueError("k must be non-negative")
        self.k = k
        self.unigram_counts = Counter()
        self.bigram_counts = Counter()
        self.vocab = set()
        self.total_unigrams = 0

    def fit(self, texts):
        """Accumulate unigram/bigram counts and vocabulary from ``texts``.

        Args:
            texts: An iterable of strings. A single string is treated as a
                one-document corpus rather than iterated character by
                character. Bigrams never span two texts.
        """
        if isinstance(texts, str):
            texts = [texts]
        for text in texts:
            tokens = [t.lower() for t in word_tokenize(text)]
            self.total_unigrams += len(tokens)
            self.unigram_counts.update(tokens)
            self.bigram_counts.update(zip(tokens, tokens[1:]))
            self.vocab.update(tokens)

    def prob(self, w1, w2):
        """Return P(w2 | w1) with add-k smoothing.

        Computed as ``(count(w1, w2) + k) / (count(w1) + k * V)`` where ``V``
        is the vocabulary size; returns 0.0 when the denominator is zero.
        """
        w1 = w1.lower()
        w2 = w2.lower()
        V = len(self.vocab)
        count_bigram = self.bigram_counts.get((w1, w2), 0)
        count_w1 = self.unigram_counts.get(w1, 0)
        denom = count_w1 + self.k * V
        return (count_bigram + self.k) / denom if denom > 0 else 0.0
# demo: fit on a tiny corpus and print a few smoothed bigram probabilities.
if __name__ == "__main__":
    corpus = [
        "I love natural language processing",
        "I love coding in Python",
        "Natural language processing is fun",
        "Python makes coding easy"
    ]
    model = BigramAddKSmoother(k=1.0)  # k=1 -> Laplace
    model.fit(corpus)
    # The last pair is absent from the corpus, to show the smoothed estimate.
    pairs = [("i", "love"), ("love", "natural"), ("natural", "language"),
             ("python", "makes"), ("unknown", "word")]
    for a, b in pairs:
        print(f"P({b}|{a}) = {model.prob(a, b):.4f}")
8. Using NLTK package to convert audio file to text and text file to audio files
A) Text to audio
Program:
from gtts import gTTS

# read text from file
# NOTE(review): the original input filename was lost in extraction;
# "input.txt" is a placeholder - replace it with the intended file.
with open("input.txt", "r", encoding="utf-8") as f:
    text = f.read()

# convert to speech
tts = gTTS(text=text, lang='en')
tts.save("output.mp3")
print("Audio saved as output.mp3")
B. Audio to Text
Program:
import speech_recognition as sr

r = sr.Recognizer()

# NOTE(review): the original audio filename was lost in extraction;
# "audio.wav" is a placeholder - replace it with the intended file.
with sr.AudioFile("audio.wav") as source:
    audio = r.record(source)  # read the audio file's contents

try:
    text = r.recognize_google(audio)
except sr.UnknownValueError:
    # The recognizer could not make out any speech in the audio.
    print("Could not understand the audio.")
except sr.RequestError as err:
    # The recognition service could not be reached or rejected the request.
    print("Speech recognition request failed:", err)
else:
    print("Text:", text)