Commit 9cd92268 authored by Pinky Sabu

uploading files

parent 63662436
{
"cells": [
{
"cell_type": "code",
"execution_count": 19,
"id": "46ba8b93",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.tree import plot_tree\n",
"from sklearn.metrics import accuracy_score,confusion_matrix\n",
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b8b365ef",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>mean_radius</th>\n",
" <th>mean_texture</th>\n",
" <th>mean_perimeter</th>\n",
" <th>mean_area</th>\n",
" <th>mean_smoothness</th>\n",
" <th>diagnosis</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>17.99</td>\n",
" <td>10.38</td>\n",
" <td>122.80</td>\n",
" <td>1001.0</td>\n",
" <td>0.11840</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>20.57</td>\n",
" <td>17.77</td>\n",
" <td>132.90</td>\n",
" <td>1326.0</td>\n",
" <td>0.08474</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>19.69</td>\n",
" <td>21.25</td>\n",
" <td>130.00</td>\n",
" <td>1203.0</td>\n",
" <td>0.10960</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>11.42</td>\n",
" <td>20.38</td>\n",
" <td>77.58</td>\n",
" <td>386.1</td>\n",
" <td>0.14250</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>20.29</td>\n",
" <td>14.34</td>\n",
" <td>135.10</td>\n",
" <td>1297.0</td>\n",
" <td>0.10030</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" mean_radius mean_texture mean_perimeter mean_area mean_smoothness \\\n",
"0 17.99 10.38 122.80 1001.0 0.11840 \n",
"1 20.57 17.77 132.90 1326.0 0.08474 \n",
"2 19.69 21.25 130.00 1203.0 0.10960 \n",
"3 11.42 20.38 77.58 386.1 0.14250 \n",
"4 20.29 14.34 135.10 1297.0 0.10030 \n",
"\n",
" diagnosis \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data=pd.read_csv(\"Breast_cancer_data_kaggle.csv\")\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9d6deb44",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"mean_radius 0\n",
"mean_texture 0\n",
"mean_perimeter 0\n",
"mean_area 0\n",
"mean_smoothness 0\n",
"diagnosis 0\n",
"dtype: int64"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "85559b70",
"metadata": {},
"outputs": [],
"source": [
"clf=MultinomialNB()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "9dfd38f1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,\n",
" 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,\n",
" 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0,\n",
" 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,\n",
" 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1,\n",
" 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,\n",
" 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1])"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X=data.loc[:,'mean_radius':]\n",
"y=data['diagnosis']\n",
"x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=.3)\n",
"clf.fit(x_train,y_train)\n",
"y_pred=clf.predict(x_test)\n",
"y_pred"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "65aeb3a2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[54, 14],\n",
" [ 4, 99]])"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"confusion_matrix(y_test,y_pred)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "bb3dd22d",
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "'numpy.ndarray' object has no attribute 'plot'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/var/folders/m7/36jtw44947q44fbpkfbr42_00000gr/T/ipykernel_63966/1064681650.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0my_pred\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mplot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m: 'numpy.ndarray' object has no attribute 'plot'"
]
}
],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
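The notebook above fits MultinomialNB on the Kaggle breast-cancer measurements, but two details are worth flagging: accuracy_score is imported and never used, and the slice data.loc[:, 'mean_radius':] runs to the end of the frame, so the diagnosis target itself ends up inside the feature matrix. The sketch below is not part of the committed notebook; it reuses the same file and variable names, stops the slice before diagnosis, and adds a fixed random_state (an assumption, purely for repeatability) before reporting the confusion matrix and accuracy.

# Sketch only -- not part of the committed notebook.
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

data = pd.read_csv("Breast_cancer_data_kaggle.csv")

# Keep only the measurement columns; 'diagnosis' is the target, not a feature.
X = data.loc[:, 'mean_radius':'mean_smoothness']
y = data['diagnosis']

# random_state is an added assumption so the split (and the scores) are repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

clf = MultinomialNB()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

print(confusion_matrix(y_test, y_pred))
print("accuracy:", accuracy_score(y_test, y_pred))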
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"id": "6769f29e",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score,confusion_matrix\n",
"from sklearn.tree import plot_tree"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "1576c085",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>mean_radius</th>\n",
" <th>mean_texture</th>\n",
" <th>mean_perimeter</th>\n",
" <th>mean_area</th>\n",
" <th>mean_smoothness</th>\n",
" <th>diagnosis</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>17.99</td>\n",
" <td>10.38</td>\n",
" <td>122.80</td>\n",
" <td>1001.0</td>\n",
" <td>0.11840</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>20.57</td>\n",
" <td>17.77</td>\n",
" <td>132.90</td>\n",
" <td>1326.0</td>\n",
" <td>0.08474</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>19.69</td>\n",
" <td>21.25</td>\n",
" <td>130.00</td>\n",
" <td>1203.0</td>\n",
" <td>0.10960</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>11.42</td>\n",
" <td>20.38</td>\n",
" <td>77.58</td>\n",
" <td>386.1</td>\n",
" <td>0.14250</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>20.29</td>\n",
" <td>14.34</td>\n",
" <td>135.10</td>\n",
" <td>1297.0</td>\n",
" <td>0.10030</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" mean_radius mean_texture mean_perimeter mean_area mean_smoothness \\\n",
"0 17.99 10.38 122.80 1001.0 0.11840 \n",
"1 20.57 17.77 132.90 1326.0 0.08474 \n",
"2 19.69 21.25 130.00 1203.0 0.10960 \n",
"3 11.42 20.38 77.58 386.1 0.14250 \n",
"4 20.29 14.34 135.10 1297.0 0.10030 \n",
"\n",
" diagnosis \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data=pd.read_csv(\"Breast_cancer_data_kaggle.csv\")\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "c4394780",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"mean_radius 0\n",
"mean_texture 0\n",
"mean_perimeter 0\n",
"mean_area 0\n",
"mean_smoothness 0\n",
"diagnosis 0\n",
"dtype: int64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "2000f0aa",
"metadata": {},
"outputs": [],
"source": [
"clf=RandomForestClassifier()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "89e47e80",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 56, 0],\n",
" [ 0, 115]])"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X=data.loc[:,'mean_radius':]\n",
"y=data['diagnosis']\n",
"X.head()\n",
"x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=.3)\n",
"clf.fit(x_train,y_train)\n",
"y_pred=clf.predict(x_test)\n",
"confusion_matrix(y_test,y_pred)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
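The second notebook repeats the same feature slice before fitting RandomForestClassifier, so the target again sits inside X; a forest can split directly on that leaked column, which is consistent with the perfectly diagonal confusion matrix recorded above. The sketch below, again not part of the commit, drops the target from the features, reports accuracy, and puts the imported-but-unused plot_tree to work on one tree of the fitted forest; the random_state values are assumptions added for repeatability.

# Sketch only -- not part of the committed notebook.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import plot_tree

data = pd.read_csv("Breast_cancer_data_kaggle.csv")
X = data.loc[:, 'mean_radius':'mean_smoothness']   # exclude the 'diagnosis' target
y = data['diagnosis']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

clf = RandomForestClassifier(random_state=0)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

print(confusion_matrix(y_test, y_pred))
print("accuracy:", accuracy_score(y_test, y_pred))

# Which measurements the fitted forest relies on, then the top of one of its trees.
print(dict(zip(X.columns, clf.feature_importances_)))
plot_tree(clf.estimators_[0], feature_names=list(X.columns), max_depth=2, filled=True)
plt.show()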
{
"cells": [
{
"cell_type": "code",
"execution_count": 39,
"id": "ab25a671",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score,confusion_matrix"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "60224855",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>v1</th>\n",
" <th>v2</th>\n",
" <th>Unnamed: 2</th>\n",
" <th>Unnamed: 3</th>\n",
" <th>Unnamed: 4</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ham</td>\n",
" <td>Go until jurong point, crazy.. Available only ...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ham</td>\n",
" <td>Ok lar... Joking wif u oni...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>spam</td>\n",
" <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>ham</td>\n",
" <td>U dun say so early hor... U c already then say...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ham</td>\n",
" <td>Nah I don't think he goes to usf, he lives aro...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" v1 v2 Unnamed: 2 \\\n",
"0 ham Go until jurong point, crazy.. Available only ... NaN \n",
"1 ham Ok lar... Joking wif u oni... NaN \n",
"2 spam Free entry in 2 a wkly comp to win FA Cup fina... NaN \n",
"3 ham U dun say so early hor... U c already then say... NaN \n",
"4 ham Nah I don't think he goes to usf, he lives aro... NaN \n",
"\n",
" Unnamed: 3 Unnamed: 4 \n",
"0 NaN NaN \n",
"1 NaN NaN \n",
"2 NaN NaN \n",
"3 NaN NaN \n",
"4 NaN NaN "
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data=pd.read_csv('spam.csv',encoding='latin-1')\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "5d3c9849",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"v1 0\n",
"v2 0\n",
"Unnamed: 2 5522\n",
"Unnamed: 3 5560\n",
"Unnamed: 4 5566\n",
"dtype: int64"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "5d0726b5",
"metadata": {},
"outputs": [],
"source": [
"data.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True,axis=1)\n"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "7953e2f6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>v1</th>\n",
" <th>v2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ham</td>\n",
" <td>Go until jurong point, crazy.. Available only ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>ham</td>\n",
" <td>Ok lar... Joking wif u oni...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>spam</td>\n",
" <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>ham</td>\n",
" <td>U dun say so early hor... U c already then say...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ham</td>\n",
" <td>Nah I don't think he goes to usf, he lives aro...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" v1 v2\n",
"0 ham Go until jurong point, crazy.. Available only ...\n",
"1 ham Ok lar... Joking wif u oni...\n",
"2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n",
"3 ham U dun say so early hor... U c already then say...\n",
"4 ham Nah I don't think he goes to usf, he lives aro..."
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "7b2dac92",
"metadata": {},
"outputs": [],
"source": [
"X=data.v2\n",
"Y=data.v1"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "45b59096",
"metadata": {},
"outputs": [],
"source": [
"x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=.3)\n",
"vectorizer=CountVectorizer(min_df=5)\n",
"counts=vectorizer.fit_transform(x_train.values)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "e6692882",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"matrix([[0, 0, 0, ..., 0, 0, 0],\n",
" [0, 0, 0, ..., 0, 0, 0],\n",
" [0, 0, 0, ..., 0, 0, 0],\n",
" ...,\n",
" [0, 0, 0, ..., 0, 0, 0],\n",
" [0, 0, 0, ..., 0, 0, 0],\n",
" [0, 0, 0, ..., 0, 0, 0]])"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"counts.todense()"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "543c9929",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"MultinomialNB()"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf=MultinomialNB()\n",
"targets=y_train.values\n",
"clf.fit(counts,targets)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "ebcf6f81",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[1433, 11],\n",
" [ 15, 213]])"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_pred=clf.predict(vectorizer.transform(x_test))\n",
"confusion_matrix(y_test,y_pred)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
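The spam notebook fits the CountVectorizer on the training split and then correctly reuses the same fitted vectorizer at predict time, a step that is easy to get wrong when the vectorizer and classifier are separate objects. As a hedged alternative (not part of the commit), the same model can be expressed as a scikit-learn Pipeline so the vectorizer and classifier are fitted and applied together; the sketch also reports accuracy via the imported-but-unused accuracy_score, and the random_state is an added assumption.

# Sketch only -- not part of the committed notebook.
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

data = pd.read_csv('spam.csv', encoding='latin-1')
data = data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

X, Y = data.v2, data.v1
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# The vectorizer is fitted on x_train only and reapplied automatically at predict time.
model = make_pipeline(CountVectorizer(min_df=5), MultinomialNB())
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(confusion_matrix(y_test, y_pred))
print("accuracy:", accuracy_score(y_test, y_pred))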