Assignment 2
In [8]:
#1.2 Read the Auto.csv-dataset into memory
#Importing all packages first
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import sklearn
auto = pd.read_csv('Auto.csv')
auto.head()
# Print unique values in 'horsepower' column *before* processing
print("Unique values in 'horsepower' before processing:", auto['horsepower'].unique())
Unique values in 'horsepower' before processing: ['130' '165' '150' '140' '198' '220' '215' '225' '190' '170' '160' '95' '97' '85' '88' '46' '87' '90' '113' '200' '210' '193' '?' '100' '105' '175' '153' '180' '110' '72' '86' '70' '76' '65' '69' '60' '80' '54' '208' '155' '112' '92' '145' '137' '158' '167' '94' '107' '230' '49' '75' '91' '122' '67' '83' '78' '52' '61' '93' '148' '129' '96' '71' '98' '115' '53' '81' '79' '120' '152' '102' '108' '68' '58' '149' '89' '63' '48' '66' '139' '103' '125' '133' '138' '135' '142' '77' '62' '132' '84' '64' '74' '116' '82']
In [9]:
#1.3 In the horsepower-column, some values are missing. These are encoded with ‘?’. Remove these rows from the dataset.
#Filtering the auto dataframe to keep only the rows where horsepower is not '?'
auto = auto[auto['horsepower'] != '?']
print("Unique values in 'horsepower' after removing '?':", auto['horsepower'].unique())
Unique values in 'horsepower' after removing '?': ['130' '165' '150' '140' '198' '220' '215' '225' '190' '170' '160' '95' '97' '85' '88' '46' '87' '90' '113' '200' '210' '193' '100' '105' '175' '153' '180' '110' '72' '86' '70' '76' '65' '69' '60' '80' '54' '208' '155' '112' '92' '145' '137' '158' '167' '94' '107' '230' '49' '75' '91' '122' '67' '83' '78' '52' '61' '93' '148' '129' '96' '71' '98' '115' '53' '81' '79' '120' '152' '102' '108' '68' '58' '149' '89' '63' '48' '66' '139' '103' '125' '133' '138' '135' '142' '77' '62' '132' '84' '64' '74' '116' '82']
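An equivalent approach, sketched here only as an aside, is to declare '?' a missing-value marker at read time; pandas then parses horsepower as numeric directly, so no separate conversion step is needed (auto_alt is a hypothetical name, not used elsewhere in this notebook):
# Alternative sketch: treat '?' as NaN while reading, then drop those rows.
# The result matches the filtering above, and horsepower comes back numeric.
auto_alt = pd.read_csv('Auto.csv', na_values='?')
auto_alt = auto_alt.dropna(subset=['horsepower'])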
In [10]:
#1.4 Create a new column ‘muscle’. This column should contain a 1 for all muscle cars (e.g. cars that have above average horsepower) and 0 for the rest.
#(HINT: To help you out I have calculated that the mean horsepower is 104.46. After creating the column you should end up with approximately 148
#muscle cars and 244 others (assuming that you have already removed the rows with horsepower=’?’ above)).
#The horsepower column was read as strings because the '?' placeholders forced pandas to
#treat the whole column as text, so here I'm converting it to numeric values. errors='coerce'
#would turn any remaining non-numeric entries into NaN, which isn't strictly necessary here
#since those rows were already removed above.
auto['horsepower'] = pd.to_numeric(auto['horsepower'], errors='coerce')
#Adding a column "muscle" to the dataframe: 1 if the car's horsepower is above the mean
#(about 104.46, matching the hint), 0 otherwise
auto['muscle'] = np.where(auto['horsepower'] > auto['horsepower'].mean(), 1, 0)
#Counting the number of muscle cars (or entries with value 1)
muscle_car_count = auto['muscle'].sum()
print(muscle_car_count)
#Double checking if the 0-entries made it to the dataframe.
non_muscle_car_count = (auto['muscle'] == 0).sum()
print(non_muscle_car_count)
148
244
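As a quick sanity check, both class counts can also be read off in one call; this is just an equivalent way to verify the 148/244 split above:
# value_counts() reports both classes at once (244 zeros and 148 ones expected)
print(auto['muscle'].value_counts())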
In [11]:
#1.5 Split the dataset into a training set and a test set, by randomly drawing 80% of the rows for the former and 20% of the rows for the latter.
#We will get into why this is a good idea in later lectures, but it's good practice to
#start evaluating models on held-out data from the get-go.
def train_test_split_pandas(df, test_size=0.2):
    # Randomly permute the row indices of the entire dataframe
    indices = np.random.permutation(len(df))
    # The split index sits at 80% of the dataframe's length
    split_index = int((1 - test_size) * len(df))
    # Training rows come before the split index, test rows after it
    train_set = indices[:split_index]
    test_set = indices[split_index:]
    # Return dataframes for the training set and the test set
    return df.iloc[train_set], df.iloc[test_set]
#Splitting the data with a numpy permutation, defining the test size as 20%.
#Note: the permutation is unseeded, so each run produces a different split.
# Splitting the data
train_df, test_df = train_test_split_pandas(auto)
# Inspecting the frames
print("Training set shape:", train_df.shape)
print("Test set shape:", test_df.shape)
Training set shape: (313, 10)
Test set shape: (79, 10)
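Because the split above is unseeded, the exact rows drawn (and therefore the MAE values reported below) change on every run. A reproducible alternative is scikit-learn's built-in splitter, sketched here under the assumption of an arbitrary seed of 42; using it would of course produce a different split than the one recorded in this notebook (train_alt and test_alt are hypothetical names):
# Reproducible alternative: scikit-learn's splitter with a fixed seed.
# random_state pins the permutation, so reruns give identical splits.
from sklearn.model_selection import train_test_split
train_alt, test_alt = train_test_split(auto, test_size=0.2, random_state=42)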
In [13]:
#2.1. Fit a simple linear regression model using horsepower as the predictor and mpg as the outcome using the training data.
#horsepower was already converted to numeric in 1.4, so no further cleanup is needed here.
#Build the design matrix (an explicit intercept column plus horsepower) and the outcome.
X = pd.DataFrame({'intercept': np.ones(train_df.shape[0]),
                  'horsepower': train_df['horsepower']})
y = train_df['mpg']
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                    mpg   R-squared:                       0.587
Model:                            OLS   Adj. R-squared:                  0.586
Method:                 Least Squares   F-statistic:                     442.6
Date:                Thu, 06 Mar 2025   Prob (F-statistic):           9.93e-62
Time:                        15:01:20   Log-Likelihood:                -951.80
No. Observations:                 313   AIC:                             1908.
Df Residuals:                     311   BIC:                             1915.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
intercept     39.2874      0.813     48.310      0.000      37.687      40.888
horsepower    -0.1520      0.007    -21.039      0.000      -0.166      -0.138
==============================================================================
Omnibus:                       11.927   Durbin-Watson:                   1.880
Prob(Omnibus):                  0.003   Jarque-Bera (JB):               12.412
Skew:                           0.486   Prob(JB):                      0.00202
Kurtosis:                       3.088   Cond. No.                         319.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
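As a cross-check, the same line can be recovered with a plain least-squares fit in numpy; the two numbers should match the OLS coefficients above to numerical precision (intercept_check is a hypothetical name introduced just for this check):
# np.polyfit with degree 1 returns [slope, intercept]
slope, intercept_check = np.polyfit(train_df['horsepower'], train_df['mpg'], 1)
print(slope, intercept_check)  # expect roughly -0.152 and 39.29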
In [14]:
#2.2 Create a scatter plot with horsepower on the x-axis and mpg on the y-axis using the testing data. Plot the regression line found by the model
#in the plot (Hint: To achieve this you need to find the intercept and the single coefficient of the model, generate predictions or use built-in
#plotting functionality).
# Get the intercept and coefficient from the fitted model
intercept = results.params['intercept']
horsepower_coef = results.params['horsepower']
# Create the scatter plot using the *testing* data
plt.figure(figsize=(10, 6))
plt.scatter(test_df['horsepower'], test_df['mpg'], label='Test Data Points')
# Generate evenly spaced x-values and the corresponding predicted y-values
x_values = np.linspace(test_df['horsepower'].min(), test_df['horsepower'].max(), 100)
y_values = intercept + horsepower_coef * x_values
# Plot the regression line
plt.plot(x_values, y_values, color='red', label='Regression Line')
# Add labels, title, legend, and grid
plt.xlabel('Horsepower')
plt.ylabel('MPG')
plt.title('Scatterplot of MPG vs. Horsepower (Test Data) with Regression Line')
plt.legend()
plt.grid(True)
# Show the plot
plt.show()
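Per the hint in the task, the line's y-values can also come straight from the fitted model instead of being computed by hand. A minimal sketch, assuming the same x_values grid as above (the design matrix must carry the same columns the model was fit with; line_X is a hypothetical name):
# Equivalent way to draw the line: let the model predict y over the x-grid
line_X = pd.DataFrame({'intercept': np.ones(len(x_values)),
                       'horsepower': x_values})
plt.plot(x_values, results.predict(line_X), color='red', label='Regression Line')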
In [27]:
#2.3 Use the model to generate predictions for the training set. Calculate and report the mean absolute error (MAE) of these predictions.
from sklearn.metrics import mean_absolute_error
#Re-fitting here isn't strictly required, since variables persist across cells in a
#notebook; repeating the setup just keeps this cell self-contained.
# Create X and y using the training data
X_train = pd.DataFrame({'intercept': np.ones(train_df.shape[0]),
                        'horsepower': train_df['horsepower']})
y_train = train_df['mpg']
# Fit the model
model = sm.OLS(y_train, X_train)
results = model.fit()
# Generate predictions on the training data
predictions_train = results.predict(X_train)
# Calculate the MAE on the training data
mae_train = mean_absolute_error(y_train, predictions_train)
# Print the MAE on the training data
print("Mean Absolute Error on Training Data:", mae_train)
Mean Absolute Error on Training Data: 4.023020602728515
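For reference, the MAE is just the mean of the absolute residuals, so the sklearn call can be verified by hand:
# MAE = (1/n) * sum(|y_i - yhat_i|); should equal mae_train above
print(np.mean(np.abs(y_train - predictions_train)))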
In [29]:
#2.4 Use the model to generate predictions for the test set. Calculate and report the MAE of the predictions.
#As above, repeating the setup isn't strictly necessary, but it keeps the cell self-contained.
from sklearn.metrics import mean_absolute_error
# Create X for the test set
X_test = pd.DataFrame({'intercept': np.ones(test_df.shape[0]),
                       'horsepower': test_df['horsepower']})
y_test = test_df['mpg']
# Generate predictions on the test data
predictions_test = results.predict(X_test)
# Calculate the MAE on the test data
mae_test = mean_absolute_error(y_test, predictions_test)
# Print the MAE on the test data
print("Mean Absolute Error on Test Data:", mae_test)
Mean Absolute Error on Test Data: 3.1371164727250793
In [31]:
#2.5 Reflection: Is the training or testing MAE lower? Does this match your expectation?
#What would be the general pattern we expect here (e.g. one is lower than the other, they
#are the same, ...), and why do we expect that?
#Here the mean absolute error is lower on the testing data (3.14) than on the training
#data (4.02). This is the opposite of the general pattern: because the model's parameters
#are optimized on the training data, we usually expect training error to be the lower of
#the two, with test error giving an honest estimate of performance on unseen data. With a
#small (79-row), unseeded test split, the test MAE can come out lower simply by chance,
#which is what happened here.
In [33]:
#3.1 Fit a multivariate linear regression model using horsepower, weight, displacement, and year as predictors and mpg as the outcome.
# Create X and y using the training data
X_train = pd.DataFrame({'intercept': np.ones(train_df.shape[0]),
                        'horsepower': train_df['horsepower'],
                        'weight': train_df['weight'],
                        'displacement': train_df['displacement'],
                        'year': train_df['year']})
y_train = train_df['mpg']
# Create X and y using the testing data
X_test = pd.DataFrame({'intercept': np.ones(test_df.shape[0]),
                       'horsepower': test_df['horsepower'],
                       'weight': test_df['weight'],
                       'displacement': test_df['displacement'],
                       'year': test_df['year']})
y_test = test_df['mpg']
# Fit the model
model = sm.OLS(y_train, X_train)
results = model.fit()
print(results.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                    mpg   R-squared:                       0.816
Model:                            OLS   Adj. R-squared:                  0.813
Method:                 Least Squares   F-statistic:                     340.9
Date:                Thu, 06 Mar 2025   Prob (F-statistic):          9.32e-112
Time:                        15:19:40   Log-Likelihood:                -825.58
No. Observations:                 313   AIC:                             1661.
Df Residuals:                     308   BIC:                             1680.
Df Model:                           4
Covariance Type:            nonrobust
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
intercept      -15.5838      4.687     -3.325      0.001     -24.806      -6.362
horsepower      -0.0005      0.011     -0.047      0.963      -0.022       0.021
weight          -0.0071      0.001    -11.089      0.000      -0.008      -0.006
displacement     0.0038      0.006      0.651      0.516      -0.008       0.015
year             0.7819      0.059     13.308      0.000       0.666       0.897
==============================================================================
Omnibus:                       29.781   Durbin-Watson:                   2.019
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               44.176
Skew:                           0.629   Prob(JB):                     2.55e-10
Kurtosis:                       4.343   Cond. No.                     7.60e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.6e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [35]:
#3.2 Print the intercept and coefficients of the model (these should be identifiable such that I’m able to deduce which coefficient belongs to
#which variable). What can you say about the relationship between mpg and the year-variable based on these?
# Print the intercept and coefficients
print("Intercept:", results.params['intercept'])
print("Horsepower:", results.params['horsepower'])
print("Weight:", results.params['weight'])
print("Displacement:", results.params['displacement'])
print("Year:", results.params['year'])
#Holding all other variables constant, for each one-year increase in the model year, the MPG is predicted to increase by about 0.78 miles per gallon.
Intercept: -15.583842593461753
Horsepower: -0.0005217407555629452
Weight: -0.007082942382751896
Displacement: 0.003795476635871032
Year: 0.7818844825299315
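Since statsmodels stores the fitted coefficients as a pandas Series indexed by variable name, the same information is also available in a single call:
# One-call alternative: the labelled coefficient vector
print(results.params)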
In [37]:
#3.3 Use the model to generate predictions for the training set. Calculate and report the MAE of these predictions.
from sklearn.metrics import mean_absolute_error
# Generate predictions on the training data
predictions_train = results.predict(X_train)
# Calculate the MAE on the training data
mae_train = mean_absolute_error(y_train, predictions_train)
# Print the MAE on the training data
print("Mean Absolute Error on Training Data (Multivariate):", mae_train)
Mean Absolute Error on Training Data (Multivariate): 2.6595989573181384
In [39]:
#3.4 Use the model to generate predictions for the test set. Calculate and report the MAE of the predictions.
from sklearn.metrics import mean_absolute_error
# Generate predictions on the test data
predictions_test = results.predict(X_test)
# Calculate the MAE on the test data
mae_test = mean_absolute_error(y_test, predictions_test)
# Print the MAE on the test data
print("Mean Absolute Error on Test Data (Multivariate):", mae_test)
Mean Absolute Error on Test Data (Multivariate): 2.4730985734382527
In [ ]:
#3.5 Reflection: Is the training MAE lower or higher than in the simple linear regression model? Does it have to be this way, or could it have
#been otherwise? What about the testing MAE?
#The training MAE of the multivariate model (2.66) is lower than that of the simple model
#(4.02). For the training data this is close to guaranteed: adding predictors can only
#improve (or leave unchanged) the in-sample least-squares fit, although the MAE specifically
#is not strictly guaranteed to fall, since OLS minimizes squared rather than absolute error.
#The testing MAE also dropped (2.47 vs 3.14), but that did not have to happen: extra
#predictors can overfit and make test error worse. Here the added variables carry real
#signal, so the multivariate model also generalizes better.
In [43]:
#4.1 Fit a logistic regression model using weight, displacement and year as predictors and our newly created muscle-column as the outcome.
#Why don’t we use horsepower as a predictor in this model?
#Note: this model is fit on the full (cleaned) dataset; a stricter protocol would fit on
#train_df only, as in the regression models above.
X = auto[['weight', 'displacement', 'year']]
y = auto['muscle']
# Add a constant (intercept) column to the predictors
X = sm.add_constant(X)
# Fit the logistic regression model
model = sm.Logit(y, X)
results = model.fit()
# Print the model summary
print(results.summary())
#We don't use horsepower as a predictor because the muscle label was constructed directly
#from horsepower (1 if horsepower is above the mean). Including it would leak the outcome
#into the predictors and make the classification task trivial.
Optimization terminated successfully.
         Current function value: 0.274044
         Iterations 8
                           Logit Regression Results
==============================================================================
Dep. Variable:                 muscle   No. Observations:                  392
Model:                          Logit   Df Residuals:                      388
Method:                           MLE   Df Model:                            3
Date:                Thu, 06 Mar 2025   Pseudo R-squ.:                  0.5866
Time:                        15:38:32   Log-Likelihood:                -107.43
converged:                       True   LL-Null:                       -259.84
Covariance Type:            nonrobust   LLR p-value:                 8.982e-66
================================================================================
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
const          -10.6524      4.360     -2.443      0.015     -19.198      -2.107
weight           0.0024      0.001      3.951      0.000       0.001       0.004
displacement     0.0107      0.004      2.424      0.015       0.002       0.019
year             0.0088      0.055      0.160      0.873      -0.099       0.116
================================================================================
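Logit coefficients live on the log-odds scale, so they are often easier to read after exponentiating. A small sketch using the fitted results above:
# exp(coef) is the multiplicative change in the odds of being a muscle car
# per one-unit increase in that predictor, holding the others fixed
print(np.exp(results.params))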
In [51]:
#4.2 Use the model to generate predictions for the training set. Calculate and report the accuracy of these predictions.
from sklearn.metrics import accuracy_score
# Generate predicted probabilities on the same data the model was fit on.
# (The model above was fit on all 392 rows, so this is in-sample accuracy for the
# full dataset rather than for the 313-row training split alone.)
predictions = results.predict(X)
# Convert probabilities to binary predictions (0 or 1)
binary_predictions = [1 if x >= 0.5 else 0 for x in predictions]
# Calculate the accuracy
accuracy = accuracy_score(y, binary_predictions)
# Print the accuracy
print("Accuracy:", accuracy)
Accuracy: 0.8673469387755102
In [55]:
#4.3 Use the model to generate predictions for the testing set. Calculate and report the accuracy of these predictions.
# Build the test design matrix with the same columns the logistic model was fit with
# (const, weight, displacement, year). Reusing X_test from section 3 would not work here:
# it has an extra horsepower column and an intercept column under a different name, and
# y_test from section 3 holds mpg values, not the muscle labels.
X_test_logit = sm.add_constant(test_df[['weight', 'displacement', 'year']])
y_test_logit = test_df['muscle']
# Generate predicted probabilities on the test rows
predictions = results.predict(X_test_logit)
# Convert probabilities to binary predictions (0 or 1)
binary_predictions = [1 if p >= 0.5 else 0 for p in predictions]
# Calculate the accuracy
accuracy = accuracy_score(y_test_logit, binary_predictions)
# Print the accuracy
print("Accuracy:", accuracy)
Accuracy: 0.8860759493670886
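Accuracy alone hides where the errors fall; a confusion matrix makes the per-class breakdown visible. A sketch reusing the test predictions from 4.3 (y_test_logit and binary_predictions as defined there):
# Rows = true class (0 = other, 1 = muscle), columns = predicted class
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test_logit, binary_predictions))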