import pandas as pd
# Read a tab-separated file into a two-column frame. Because explicit column
# names are passed, the first row of the file is read as data rather than
# being consumed as a header.
# NOTE(review): the path literal '[Link]' looks like a scrubbed placeholder,
# not a real location — restore the actual dataset path/URL.
df=pd.read_csv('[Link]',sep='\t',names=['label','text'])
# Notebook-style bare expression: displays the frame when run as a cell.
df
# NOTE(review): `[Link]` is a scrubbed placeholder, presumably an attribute
# lookup on df (e.g. df.shape) — confirm and restore; as written it raises
# NameError.
[Link]
!pip install nltk
import nltk

# Fetch the NLTK data packages the rest of the script relies on.
# 'stopwords' backs stopwords.words('english').
nltk.download('stopwords')
# word_tokenize needs the Punkt sentence-tokenizer data; without it the first
# tokenize call raises LookupError. Older NLTK releases load 'punkt', newer
# ones load 'punkt_tab', so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')
# Short sample sentence used to try out each preprocessing step in turn.
sent = 'Hello friends! How are you?'

# Step 1: split the sentence into word and punctuation tokens.
from nltk.tokenize import word_tokenize
word_tokenize(sent)

# Step 2: load the English stopword list.
from nltk.corpus import stopwords
swords = stopwords.words('english')
swords

# Step 3: drop tokens that appear verbatim in the stopword list.
clean = [word for word in word_tokenize(sent) if word not in swords]
clean

# Step 4: same filter, but reduce each surviving token to its Porter stem.
from nltk.stem import PorterStemmer
ps = PorterStemmer()
clean = [ps.stem(word) for word in word_tokenize(sent)
         if word not in swords]
clean

# Longer sample sentence for exercising clean_text.
sent = 'Hello friends! How are you? we will be learning python today.'
def clean_text(sent):
    """Tokenize *sent* and return the stems of its content words.

    Steps:
      1. Tokenize with ``word_tokenize``.
      2. Keep only tokens that are purely numeric or purely alphabetic,
         which discards punctuation and mixed tokens.
      3. Drop English stopwords (``swords``), compared case-insensitively.
      4. Reduce each remaining token to its stem with ``ps``.

    Relies on the module-level names ``word_tokenize``, ``swords`` and ``ps``.

    Args:
        sent: The raw text to process.

    Returns:
        A list of stemmed tokens, in their original order.
    """
    tokens = word_tokenize(sent)
    # NOTE(review): the two predicates were scrubbed in the source; they are
    # reconstructed here as isdigit()/isalpha() — confirm against the original.
    kept = [word for word in tokens
            if word.isdigit() or word.isalpha()]
    # The stopword list is lowercase, so compare on the lowercased token.
    # Otherwise capitalised stopwords ("How", "The") pass the filter and come
    # out of the stemmer as ordinary features.
    return [ps.stem(word) for word in kept
            if word.lower() not in swords]
# Try the cleaning function on the sample sentence.
clean_text(sent)

# pre-processing
from sklearn.feature_extraction.text import TfidfVectorizer
# A callable analyzer replaces the vectorizer's own tokenization: each raw
# document is passed to clean_text and the returned stems are the features.
tfidf = TfidfVectorizer(analyzer=clean_text)
x = df['text']
y = df['label']
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split below, so test-set vocabulary and IDF statistics influence
# the training features. Consider fitting on the training rows only.
x_new = tfidf.fit_transform(x)
# NOTE(review): this line was a scrubbed placeholder in the source;
# reconstructed as x.shape to pair with x_new.shape below — confirm.
x.shape
x_new.shape
# get_feature_names() was removed in scikit-learn 1.2; this is its replacement.
tfidf.get_feature_names_out()
# Class balance of the labels.
y.value_counts()
# Hold-out evaluation: reserve 25% of the rows for testing, with the shuffle
# seed fixed at 0 so the partition is reproducible.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x_new,
    y,
    test_size=0.25,
    random_state=0,
)
# Notebook-style checks of the resulting matrix dimensions.
x_train.shape
x_test.shape
# Baseline model: Gaussian naive Bayes.
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
# The TF-IDF matrices are sparse and GaussianNB requires dense input, hence
# the toarray() conversions for both fit and predict.
nb.fit(x_train.toarray(), y_train)
y_pred = nb.predict(x_test.toarray())
# Class balance of the held-out labels, for reading the metrics below.
y_test.value_counts()
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
from sklearn.metrics import accuracy_score, classification_report
print(classification_report(y_test, y_pred))
accuracy_score(y_test, y_pred)
# Second model: random forest, seeded for reproducibility. It is trained on
# the sparse TF-IDF matrix directly.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0)
rf.fit(x_train, y_train)
# y_pred is overwritten: the metrics below describe the forest, not the
# previous model.
y_pred = rf.predict(x_test)
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
print(classification_report(y_test, y_pred))
accuracy_score(y_test, y_pred)
# Third model: logistic regression with default settings.
from sklearn.linear_model import LogisticRegression
log = LogisticRegression()
log.fit(x_train, y_train)
# y_pred is overwritten again; the accuracy below is for logistic regression.
y_pred = log.predict(x_test)
accuracy_score(y_test, y_pred)
# hyperparameter tuning
from sklearn.model_selection import GridSearchCV
# Exhaustive grid over the random forest: 2 * 2 * 5 * 2 = 40 combinations,
# each evaluated with 5-fold cross-validation on the training split.
# NOTE(review): 'random_state' is a seed, not a model hyperparameter. Searching
# over it multiplies the cost by 5 and selects for a lucky seed — consider
# removing it from the grid.
params = {
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2'],
    'random_state': [0, 1, 2, 3, 4],
    'class_weight': ['balanced', 'balanced_subsample'],
}
grid = GridSearchCV(rf, param_grid=params, cv=5, scoring='accuracy')
# One fit is enough: with the default refit=True the search retrains the best
# combination on all of x_train, and grid.predict delegates to that model.
# NOTE(review): the source had two consecutive scrubbed fit calls followed by
# a scrubbed predict call. They are reconstructed as a single grid.fit plus
# grid.predict — confirm against the original notebook.
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
accuracy_score(y_test, y_pred)