
Commit 783ad7f

committed Apr 18, 2022
add logistic regression using pytorch tutorial
1 parent 46347a8 commit 783ad7f

File tree

5 files changed: +574 −0 lines changed

README.md

+1
@@ -87,6 +87,7 @@ This is a repository of all the tutorials of [The Python Code](https://www.thepy
 - [Customer Churn Prediction in Python](https://www.thepythoncode.com/article/customer-churn-detection-using-sklearn-in-python). ([code](machine-learning/customer-churn-detection))
 - [Recommender Systems using Association Rules Mining in Python](https://www.thepythoncode.com/article/build-a-recommender-system-with-association-rule-mining-in-python). ([code](machine-learning/recommender-system-using-association-rules))
 - [Handling Imbalanced Datasets: A Case Study with Customer Churn](https://www.thepythoncode.com/article/handling-imbalanced-datasets-sklearn-in-python). ([code](machine-learning/handling-inbalance-churn-data))
+- [Logistic Regression using PyTorch in Python](https://www.thepythoncode.com/article/logistic-regression-using-pytorch). ([code](machine-learning/logistic-regression-in-pytorch))

 - ### [General Python Topics](https://www.thepythoncode.com/topic/general-python-topics)
 - [How to Make Facebook Messenger bot in Python](https://www.thepythoncode.com/article/make-bot-fbchat-python). ([code](general/messenger-bot))
LogisticRegressionPyTorch_PythonCodeTutorial.ipynb
@@ -0,0 +1,409 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "YUssqHFr0VM8",
        "outputId": "47dcb9c2-b276-43f3-9263-ef9c7fcdf45a"
      },
      "outputs": [],
      "source": [
        "!gdown --id 12vfq3DYFId3bsXuNj_PhsACMzrLTfObs"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "CLw5KFxzz-vw"
      },
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "import numpy as np\n",
        "import torch\n",
        "import torch.nn as nn\n",
        "from sklearn.utils import resample\n",
        "from sklearn import preprocessing\n",
        "from sklearn.preprocessing import StandardScaler\n",
        "from sklearn.model_selection import train_test_split\n",
        "from warnings import filterwarnings\n",
        "filterwarnings('ignore')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 379
        },
        "id": "llEWd-dM0ZQg",
        "outputId": "c8c3348b-d68c-4f7d-c587-5279e4b867b7"
      },
      "outputs": [],
      "source": [
        "# reading the data\n",
        "data = pd.read_csv(\"data_regression.csv\")\n",
        "# check the dimensions of the data and whether the output column is continuous\n",
        "# or discrete; here the output (churn) is discrete, so a classification\n",
        "# algorithm is appropriate\n",
        "data = data.drop([\"year\", \"customer_id\", \"phone_no\"], axis=1)\n",
        "print(data.shape)    # the shape of the data\n",
        "print(data.columns)  # the columns the data has\n",
        "print(data.dtypes)   # the type of each column\n",
        "data.head()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "wHAGp4M10cUr",
        "outputId": "20095ecf-22fd-4a0a-d1ee-e125d303154b"
      },
      "outputs": [],
      "source": [
        "data.isnull().sum()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 270
        },
        "id": "qt8ctl6m0gm-",
        "outputId": "eb870fb2-a28d-4e0a-f6f7-55771f2bdaa7"
      },
      "outputs": [],
      "source": [
        "final_data = data.dropna()  # dropping the null values\n",
        "final_data.head()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "yL_bQ-mH0hzn",
        "outputId": "1537150a-c09c-4abb-b46c-082ee6bff1ce"
      },
      "outputs": [],
      "source": [
        "# how many samples are in each class, to decide the resampling target\n",
        "final_data[\"churn\"].value_counts()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "TqetEXX50i7Z",
        "outputId": "632fbf54-5189-4d61-8d8f-fb1a6735aeb0"
      },
      "outputs": [],
      "source": [
        "data_majority = final_data[final_data['churn'] == 0]  # class 0\n",
        "data_minority = final_data[final_data['churn'] == 1]  # class 1\n",
        "# upsampling the minority class\n",
        "data_minority_upsampled = resample(data_minority, replace=True, n_samples=900, random_state=123)\n",
        "# downsampling the majority class\n",
        "data_majority_downsampled = resample(data_majority, replace=False, n_samples=900, random_state=123)\n",
        "# concatenating the downsampled and upsampled classes\n",
        "data2 = pd.concat([data_majority_downsampled, data_minority_upsampled])\n",
        "# encoding categoricals: the categorical variables (gender, multi_screen,\n",
        "# mail_subscribed) must be encoded before feeding the data to the model\n",
        "label_encoder = preprocessing.LabelEncoder()\n",
        "data2['gender'] = label_encoder.fit_transform(data2['gender'])\n",
        "data2['multi_screen'] = label_encoder.fit_transform(data2['multi_screen'])\n",
        "data2['mail_subscribed'] = label_encoder.fit_transform(data2['mail_subscribed'])\n",
        "# check the distribution of the output class again after resampling\n",
        "data2[\"churn\"].value_counts()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "GUpLqdEw0tXb",
        "outputId": "041cd5a3-f9ec-447a-c0d9-f09e1e8bd4e0"
      },
      "outputs": [],
      "source": [
        "# shuffle the dataframe and reset the index first, so that X and Y stay aligned\n",
        "data2 = data2.sample(frac=1).reset_index(drop=True)\n",
        "# independent variables: this X is what the model learns its parameters from\n",
        "X = data2.iloc[:, :-1]\n",
        "# scaling the data: bringing the mean to 0 and the variance to 1 gives a\n",
        "# less noisy optimization\n",
        "sc = StandardScaler()\n",
        "X = sc.fit_transform(X)\n",
        "n_samples, n_features = X.shape  # number of samples and number of features\n",
        "# output column\n",
        "Y = data2[\"churn\"]\n",
        "# data splitting: train on the training split, evaluate later on the test split\n",
        "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=42, stratify=Y)\n",
        "print((y_train == 1).sum())\n",
        "print((y_train == 0).sum())"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "VzYgdjlU0tof",
        "outputId": "5e188583-3dcd-4ddc-8b1b-dee6bb7b524a"
      },
      "outputs": [],
      "source": [
        "print(type(X_train))\n",
        "print(type(X_test))\n",
        "print(type(y_train.values))\n",
        "print(type(y_test.values))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "y0raeQhA0u4z"
      },
      "outputs": [],
      "source": [
        "X_train = torch.from_numpy(X_train.astype(np.float32))\n",
        "X_test = torch.from_numpy(X_test.astype(np.float32))\n",
        "y_train = torch.from_numpy(y_train.values.astype(np.float32))\n",
        "y_test = torch.from_numpy(y_test.values.astype(np.float32))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "TBb_XrZt00_p",
        "outputId": "4c44564c-a8c1-4071-9527-afebd5fa310f"
      },
      "outputs": [],
      "source": [
        "y_train.shape, y_test.shape"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "6-9CiBo40vxM"
      },
      "outputs": [],
      "source": [
        "# reshape the targets into column vectors of shape (n, 1) to match the model output\n",
        "y_train = y_train.view(y_train.shape[0], 1)\n",
        "y_test = y_test.view(y_test.shape[0], 1)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "07XkWAA20w0U",
        "outputId": "63cb82b1-6650-442d-c911-cec1a0778e20"
      },
      "outputs": [],
      "source": [
        "y_train.shape, y_test.shape"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "fQxkh0FK02i5"
      },
      "outputs": [],
      "source": [
        "# logistic regression model\n",
        "class LogisticRegression(nn.Module):\n",
        "    def __init__(self, n_input_features):\n",
        "        super(LogisticRegression, self).__init__()\n",
        "        self.linear = nn.Linear(n_input_features, 1)\n",
        "\n",
        "    # sigmoid transformation of the linear output\n",
        "    def forward(self, x):\n",
        "        y_pred = torch.sigmoid(self.linear(x))\n",
        "        return y_pred"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "C8GvUnxQ05Ik"
      },
      "outputs": [],
      "source": [
        "model = LogisticRegression(n_features)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "RdYvQs1a06JP"
      },
      "outputs": [],
      "source": [
        "num_epochs = 500\n",
        "# training the model for a large number of epochs to see better results\n",
        "learning_rate = 0.0001\n",
        "# logistic regression is a binary classifier, so Binary Cross Entropy is the loss\n",
        "criterion = nn.BCELoss()\n",
        "optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "qT5pK7jr0_Ez",
        "outputId": "abf0e908-173d-447f-f8bf-0ce55c9907e3"
      },
      "outputs": [],
      "source": [
        "for epoch in range(num_epochs):\n",
        "    y_pred = model(X_train)\n",
        "    loss = criterion(y_pred, y_train)\n",
        "    loss.backward()\n",
        "    optimizer.step()\n",
        "    optimizer.zero_grad()\n",
        "    # printing loss values every 20 epochs to keep track of training\n",
        "    if (epoch + 1) % 20 == 0:\n",
        "        print(f'epoch: {epoch+1}, loss = {loss.item():.4f}')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "KYDPNSBm1C_T",
        "outputId": "77c4070e-20de-4cb0-94b1-33962a36cce8"
      },
      "outputs": [],
      "source": [
        "with torch.no_grad():\n",
        "    y_predicted = model(X_test)\n",
        "    y_predicted_cls = y_predicted.round()  # threshold the probabilities at 0.5\n",
        "    acc = y_predicted_cls.eq(y_test).sum() / float(y_test.shape[0])\n",
        "    print(f'accuracy: {acc.item():.4f}')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "0miFH7DO1oOq",
        "outputId": "38352b86-1590-490a-9f45-377ed543a3bc"
      },
      "outputs": [],
      "source": [
        "# classification report\n",
        "from sklearn.metrics import classification_report\n",
        "print(classification_report(y_test, y_predicted_cls))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "BXKCNp_q2zhp",
        "outputId": "2fe7e571-64a6-4dc5-9be7-20d365a96a05"
      },
      "outputs": [],
      "source": [
        "# confusion matrix (named cm so it does not shadow the imported function)\n",
        "from sklearn.metrics import confusion_matrix\n",
        "cm = confusion_matrix(y_test, y_predicted_cls)\n",
        "print(cm)"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "name": "LogisticRegressionPyTorch_PythonCodeTutorial.ipynb",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.9.12"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
@@ -0,0 +1 @@
# [Logistic Regression using PyTorch in Python](https://www.thepythoncode.com/article/logistic-regression-using-pytorch)
@@ -0,0 +1,158 @@
# %%
!gdown --id 12vfq3DYFId3bsXuNj_PhsACMzrLTfObs

# %%
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.utils import resample
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from warnings import filterwarnings
filterwarnings('ignore')

# %%
# reading the data
data = pd.read_csv("data_regression.csv")
# check the dimensions of the data and whether the output column is continuous
# or discrete; here the output (churn) is discrete, so a classification
# algorithm is appropriate
data = data.drop(["year", "customer_id", "phone_no"], axis=1)
print(data.shape)    # the shape of the data
print(data.columns)  # the columns the data has
print(data.dtypes)   # the type of each column
data.head()

# %%
data.isnull().sum()

# %%
final_data = data.dropna()  # dropping the null values
final_data.head()

# %%
# how many samples are in each class, to decide the resampling target
final_data["churn"].value_counts()

# %%
data_majority = final_data[final_data['churn'] == 0]  # class 0
data_minority = final_data[final_data['churn'] == 1]  # class 1
# upsampling the minority class
data_minority_upsampled = resample(data_minority, replace=True, n_samples=900, random_state=123)
# downsampling the majority class
data_majority_downsampled = resample(data_majority, replace=False, n_samples=900, random_state=123)
# concatenating the downsampled and upsampled classes
data2 = pd.concat([data_majority_downsampled, data_minority_upsampled])
# encoding categoricals: the categorical variables (gender, multi_screen,
# mail_subscribed) must be encoded before feeding the data to the model
label_encoder = preprocessing.LabelEncoder()
data2['gender'] = label_encoder.fit_transform(data2['gender'])
data2['multi_screen'] = label_encoder.fit_transform(data2['multi_screen'])
data2['mail_subscribed'] = label_encoder.fit_transform(data2['mail_subscribed'])
# check the distribution of the output class again after resampling
data2["churn"].value_counts()

# %%
# shuffle the dataframe and reset the index first, so that X and Y stay aligned
data2 = data2.sample(frac=1).reset_index(drop=True)
# independent variables: this X is what the model learns its parameters from
X = data2.iloc[:, :-1]
# scaling the data: bringing the mean to 0 and the variance to 1 gives a
# less noisy optimization
sc = StandardScaler()
X = sc.fit_transform(X)
n_samples, n_features = X.shape  # number of samples and number of features
# output column
Y = data2["churn"]
# data splitting: train on the training split, evaluate later on the test split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=42, stratify=Y)
print((y_train == 1).sum())
print((y_train == 0).sum())

# %%
print(type(X_train))
print(type(X_test))
print(type(y_train.values))
print(type(y_test.values))

# %%
X_train = torch.from_numpy(X_train.astype(np.float32))
X_test = torch.from_numpy(X_test.astype(np.float32))
y_train = torch.from_numpy(y_train.values.astype(np.float32))
y_test = torch.from_numpy(y_test.values.astype(np.float32))

# %%
y_train.shape, y_test.shape

# %%
# reshape the targets into column vectors of shape (n, 1) to match the model output
y_train = y_train.view(y_train.shape[0], 1)
y_test = y_test.view(y_test.shape[0], 1)

# %%
y_train.shape, y_test.shape

# %%
# logistic regression model
class LogisticRegression(nn.Module):
    def __init__(self, n_input_features):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(n_input_features, 1)

    # sigmoid transformation of the linear output
    def forward(self, x):
        y_pred = torch.sigmoid(self.linear(x))
        return y_pred

# %%
model = LogisticRegression(n_features)

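# %%
# An equivalent formulation (a sketch, not used below): the same model can be
# built without a custom class, since logistic regression is just a linear
# layer followed by a sigmoid.
equivalent_model = nn.Sequential(nn.Linear(n_features, 1), nn.Sigmoid())
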
# %%
num_epochs = 500
# training the model for a large number of epochs to see better results
learning_rate = 0.0001
# logistic regression is a binary classifier, so Binary Cross Entropy is the loss
criterion = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

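# %%
# An aside (a sketch, not used in this tutorial): nn.BCEWithLogitsLoss fuses
# the sigmoid and the binary cross-entropy into a single, more numerically
# stable operation; with it, forward() would return self.linear(x) directly,
# without applying torch.sigmoid.
criterion_with_logits = nn.BCEWithLogitsLoss()
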
# %%
for epoch in range(num_epochs):
    y_pred = model(X_train)
    loss = criterion(y_pred, y_train)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    # printing loss values every 20 epochs to keep track of training
    if (epoch + 1) % 20 == 0:
        print(f'epoch: {epoch+1}, loss = {loss.item():.4f}')

# %%
with torch.no_grad():
    y_predicted = model(X_test)
    y_predicted_cls = y_predicted.round()  # threshold the probabilities at 0.5
    acc = y_predicted_cls.eq(y_test).sum() / float(y_test.shape[0])
    print(f'accuracy: {acc.item():.4f}')

# %%
# classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_predicted_cls))

# %%
# confusion matrix (named cm so it does not shadow the imported function)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_predicted_cls)
print(cm)
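
# %%
# A minimal sketch (not part of the original tutorial) of scoring a single
# sample with the trained model: re-apply the fitted scaler, then read off the
# sigmoid output as a churn probability; `sample` is just an illustrative
# already-encoded row taken from data2.
sample = data2.iloc[[0], :-1]
with torch.no_grad():
    x = torch.from_numpy(sc.transform(sample).astype(np.float32))
    print(f'churn probability: {model(x).item():.4f}')
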
@@ -0,0 +1,5 @@
matplotlib
numpy
pandas
scikit_learn==1.0.2
torch==1.10.1
