Tokenization
# Tokenization demo: split raw text into sentences and into words with NLTK.
import nltk

nltk.download('punkt_tab')  # pretrained tokenizer models used by both tokenizers
from nltk.tokenize import sent_tokenize, word_tokenize

sample = 'nlp is a field of CSE and AI'
print(sent_tokenize(sample))
print(word_tokenize(sample))
Stopwords in NLP
# Stopword-removal demo: drop common English function words from a sentence.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download("stopwords")  # stopword lists (`nltk` itself is imported above)

sentence = "This is an example demonstrating stopwords removal in NLP."
english_stops = set(stopwords.words("english"))

kept = []
for token in word_tokenize(sentence):
    # case-insensitive membership test so "This" matches the stopword "this"
    if token.lower() not in english_stops:
        kept.append(token)

print("Filtered Words:", kept)
Stemming
# Stemming demo: reduce each word to its Porter stem (a crude suffix-stripping
# normalization, e.g. "playing" -> "play", "studied" -> "studi").
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

porter = PorterStemmer()
sentence = "Players are playing and studied various topics"
stems = [porter.stem(token) for token in word_tokenize(sentence)]
print(stems)
Lemmatization
# Lemmatization demo: map inflected verbs to their dictionary (base) form
# using WordNet, e.g. "running" -> "run", "were" -> "be".
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt_tab')  # tokenizer models for word_tokenize
nltk.download('wordnet')    # WordNet data backing the lemmatizer

wnl = WordNetLemmatizer()
sentence = "He is running and they were eating apples"
tokens = word_tokenize(sentence)
# pos="v" tells the lemmatizer to treat every token as a verb
verb_lemmas = [wnl.lemmatize(token, pos="v") for token in tokens]
print("original:", tokens)
print("lemmatized:", verb_lemmas)
WordNet
# WordNet demo: collect every synonym (lemma name) of the word "clever".
import nltk
nltk.download('wordnet')  # WordNet corpus required by wordnet.synsets
from nltk.corpus import wordnet

# FIX: the two for-loops had lost their indentation in the original paste,
# which is a SyntaxError in Python ("expected an indented block"). The inner
# loop must be nested under the outer one.
synonyms = []
for syn in wordnet.synsets("clever"):
    for lemma in syn.lemmas():
        synonyms.append(lemma.name())
print(set(synonyms))
N-gram
# N-gram demo: print every bigram and trigram of a whitespace-tokenized sentence.
from nltk import ngrams

# sentence kept verbatim from the original (including its wording)
text = "Natural Language Processing is a intresting subject"
tokens = text.split()
for size, label in ((2, "Bigrams:"), (3, "Trigrams:")):
    print(label, list(ngrams(tokens, size)))
POS tagging
# POS-tagging demo: label each token with its Penn Treebank part-of-speech tag.
import nltk

# FIX: nltk.pos_tag needs the averaged-perceptron tagger model, which this
# script never downloaded — on a fresh environment pos_tag raises LookupError.
# Newer NLTK (>= 3.8.2) names the resource with an "_eng" suffix; download
# both so the cell works across versions (download of a missing id is a no-op
# beyond an error message).
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('averaged_perceptron_tagger')

text = nltk.word_tokenize("The quick brown fox jumps over the lazy dog")
pos_tags = nltk.pos_tag(text)
print(pos_tags)