ml_project/train_model.ipynb
User Name 95826b438c init
2025-06-07 23:27:56 +02:00

333 lines
52 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "f821f560-6593-48df-8406-2149a30d262a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Hours_Studied</th>\n",
" <th>Test_Score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4.370861</td>\n",
" <td>45.417391</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>9.556429</td>\n",
" <td>104.945142</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>7.587945</td>\n",
" <td>80.631574</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>6.387926</td>\n",
" <td>60.994745</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2.404168</td>\n",
" <td>19.549604</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Hours_Studied Test_Score\n",
"0 4.370861 45.417391\n",
"1 9.556429 104.945142\n",
"2 7.587945 80.631574\n",
"3 6.387926 60.994745\n",
"4 2.404168 19.549604"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.metrics import mean_squared_error, r2_score\n",
"import joblib\n",
"\n",
"# Seed NumPy's legacy global RNG so the synthetic sample is the same on every run\n",
"np.random.seed(42)\n",
"\n",
"# Synthetic dataset: one feature (hours studied) and a target (test score)\n",
"# that depends on it linearly, plus Gaussian noise\n",
"n_samples = 500\n",
"X1 = np.random.uniform(low=1, high=10, size=n_samples)  # hours studied, uniform between 1 and 10\n",
"noise = np.random.normal(loc=0, scale=5, size=n_samples)  # zero-mean noise, standard deviation 5\n",
"y = X1 * 10 + noise  # score = 10 points per hour studied + noise\n",
"\n",
"# Collect both columns in a DataFrame and preview the first rows\n",
"data = pd.DataFrame({'Hours_Studied': X1, 'Test_Score': y})\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "9028518c-f769-4763-92f5-3fd7f93a1b80",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" Hours_Studied Test_Score\n",
"Hours_Studied 1.000000 0.983399\n",
"Test_Score 0.983399 1.000000\n"
]
}
],
"source": [
"# Scatter plot of the raw data to eyeball the relationship\n",
"fig, ax = plt.subplots()\n",
"ax.scatter(data['Hours_Studied'], data['Test_Score'])\n",
"ax.set(title='Hours Studied vs Test Score', xlabel='Hours Studied', ylabel='Test Score')\n",
"plt.show()\n",
"\n",
"# Correlation matrix of the feature and target columns\n",
"corr_matrix = data.corr()\n",
"print(corr_matrix)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "4567dd61-0a38-4fae-a10d-9fc5faaacd5d",
"metadata": {},
"outputs": [],
"source": [
"# Feature matrix (a one-column DataFrame) and target Series\n",
"X = data[['Hours_Studied']]\n",
"y = data['Test_Score']\n",
"\n",
"# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, test_size=0.2, random_state=42\n",
")\n",
"\n",
"# Fit the scaler on the training rows only, then apply that same transform to both sets\n",
"scaler = StandardScaler().fit(X_train)\n",
"X_train_scaled = scaler.transform(X_train)\n",
"X_test_scaled = scaler.transform(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "1a58967d-808e-4105-85ae-c18a493b3250",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Coefficient: [27.31629308]\n",
"Intercept: 55.31588683514129\n"
]
}
],
"source": [
"# Create the linear regression model and fit it to the scaled training data\n",
"model = LinearRegression().fit(X_train_scaled, y_train)\n",
"\n",
"# Show the fitted parameters; the coefficient applies to the standardized\n",
"# feature, not to raw hours\n",
"print(\"Coefficient: \", model.coef_)\n",
"print(\"Intercept: \", model.intercept_)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "e36d353d-30de-4893-8936-8e2e59b9670f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Predicted Test Score for 6 hours of study: 60.03282292232381\n"
]
}
],
"source": [
"# Predict the test score for one new observation\n",
"hours_studied = 6  # used for both the model input and the printed message, so they cannot drift apart\n",
"new_data = np.array([[hours_studied]])\n",
"# Use the training column name so the input matches what the scaler was fitted on\n",
"new_data_df = pd.DataFrame(new_data, columns=['Hours_Studied'])\n",
"\n",
"# Apply the scaler that was fitted on the training set\n",
"new_data_scaled = scaler.transform(new_data_df)\n",
"\n",
"# Predict the test score from the scaled input\n",
"single_prediction = model.predict(new_data_scaled)\n",
"\n",
"print(f\"Predicted Test Score for {hours_studied} hours of study: {single_prediction[0]}\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "f1d05789-67f0-4db5-b42a-05a63ec7e63c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mean Squared Error: 26.105427937987933\n",
"R-squared: 0.9645671742510007\n"
]
}
],
"source": [
"# Score the held-out test set with the fitted model\n",
"y_pred = model.predict(X_test_scaled)\n",
"\n",
"# Mean squared error and R-squared of the predictions against the true test targets\n",
"mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)\n",
"\n",
"print(f\"Mean Squared Error: {mse}\")\n",
"print(f\"R-squared: {r2}\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "c8714b02-ed56-4c59-9729-24d1343bc46c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['scaler.pkl']"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Persist both fitted objects: the model was trained on scaled inputs, so the\n",
"# scaler has to be saved alongside it for later predictions\n",
"joblib.dump(model, filename='linear_regression_model.pkl')\n",
"joblib.dump(scaler, filename='scaler.pkl')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "a568161c-3337-4743-ba01-8ab6f310bc4b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Predicted Test Score for 6 hours of study: 60.03282292232381\n"
]
}
],
"source": [
"# Reload the persisted model and scaler. joblib files are pickle-based, so only\n",
"# load files from a trusted source\n",
"loaded_model = joblib.load('linear_regression_model.pkl')\n",
"loaded_scaler = joblib.load('scaler.pkl')\n",
"\n",
"# New observation to score\n",
"hours_studied = 6  # used for both the model input and the printed message, so they cannot drift apart\n",
"new_data = np.array([[hours_studied]])\n",
"new_data_df = pd.DataFrame(new_data, columns=['Hours_Studied'])\n",
"\n",
"# Scale the input with the reloaded scaler before predicting\n",
"new_data_scaled = loaded_scaler.transform(new_data_df)\n",
"\n",
"# Predict with the reloaded model\n",
"single_prediction = loaded_model.predict(new_data_scaled)\n",
"\n",
"# Report the predicted test score\n",
"print(f\"Predicted Test Score for {hours_studied} hours of study: {single_prediction[0]}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e073c96f-b079-4052-9ecd-e557f75c6cca",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}