MACHINE LEARNING 101
sathwik
Created on March 11, 2025
Transcript
MACHINE LEARNING 101
A GUIDE TO TRAINING YOUR FIRST MACHINE LEARNING MODEL
GET STARTED
OBJECTIVES
Welcome to this interactive guide on setting up your first machine learning model. This guide will walk you through each practical step, from data preparation to deployment. By the end, you'll have created a functioning machine learning model.
TOOLS NEEDED
PYTHON
JUPYTER
SCIKIT-LEARN
PANDAS
MATPLOTLIB
FLASK
NEXT PAGE
INDEX
GUIDE
ASSESSMENT
NEXT STEPS
GUIDE
DATA PREPARATION
MODEL SELECTION
FEATURE SELECTION
EVALUATION METRICS
MODEL IMPROVEMENT
TRAINING PROCESS
PREDICTIONS WITH MODEL
MODEL DEPLOYMENT
ASSESSMENT
INDEX
PREPARING YOUR DATA
NEXT PAGE
1. Download the sample housing dataset provided
2. Load the data using pandas with the code snippet
3. Examine your data structure: data.head()
4. Check for missing values: data.isnull().sum()
5. Handle missing values with appropriate techniques
6. Convert categorical features to numeric
A worked sketch of these steps follows the snippet buttons below.
DOWNLOAD DATA
CODE SNIPPET 1
CODE SNIPPET 2
CODE SNIPPET 3
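A minimal sketch of the preparation steps above, assuming the sample dataset is a CSV named housing.csv with a categorical neighborhood column (the file and column names are assumptions that mirror the later snippets); adapt it to the file you actually download.

import pandas as pd

# Load the data (file name is an assumption; use your downloaded dataset)
data = pd.read_csv('housing.csv')

# Examine the data structure
print(data.head())

# Check for missing values
print(data.isnull().sum())

# Handle missing values, e.g. fill numeric gaps with the column median
numeric_cols = data.select_dtypes(include='number').columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

# Convert categorical features to numeric with one-hot encoding
data = pd.get_dummies(data, columns=['neighborhood'])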
INDEX
FEATURE SELECTION + ENGINEERING
NEXT PAGE
1. Examine correlation between features and target
2. Visualize key relationships
3. Create new meaningful features
4. Select the most relevant features
An example sketch of these steps appears after the snippet buttons below.
CODE SNIPPET 1
CODE SNIPPET 2
CODE SNIPPET 3
CODE SNIPPET 4
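A rough sketch of the steps above, assuming a price target column (an assumption) and the engineered features referenced later in the guide (property_age and price_per_sqft); the features list, X, and y defined here are the names used by the later snippets.

import matplotlib.pyplot as plt

# Examine correlation between numeric features and the target
print(data.corr(numeric_only=True)['price'].sort_values(ascending=False))

# Visualize a key relationship
plt.scatter(data['square_feet'], data['price'], alpha=0.5)
plt.xlabel('Square Feet')
plt.ylabel('Price')
plt.show()

# Create new meaningful features (mirroring those used in later snippets)
data['property_age'] = 2023 - data['year_built']
data['price_per_sqft'] = data['price'] / data['square_feet']

# Select the most relevant features for modeling
features = ['bedrooms', 'bathrooms', 'square_feet', 'property_age', 'price_per_sqft',
            'neighborhood_downtown', 'neighborhood_suburb', 'neighborhood_rural']
X = data[features]
y = data['price']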
INDEX
CHOOSING YOUR MODEL
NEXT PAGE
For our housing price prediction (regression task), we'll compare:
- Linear Regression
- Random Forest
- Gradient Boosting
LR
RF
XGB
CODE SNIPPET 1
CODE SNIPPET 2
INDEX
TRAINING YOUR MODEL
NEXT PAGE
1. Train each model on your prepared data
2. Monitor training process (for supported models)
3. Check training completion and model details
A basic training example follows the snippet buttons below.
CODE SNIPPET 1
CODE SNIPPET 2
CODE SNIPPET 3
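A basic training sketch, assuming the X and y built during feature selection and the three models initialized in the model-selection snippet; the 80/20 split is an assumption.

from sklearn.model_selection import train_test_split

# Hold out a test set for later evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train each model on the prepared training data
linear_model.fit(X_train, y_train)
forest_model.fit(X_train, y_train)
boost_model.fit(X_train, y_train)

# Check training completion and model details
print(f"Random Forest trained with {forest_model.n_estimators} trees")
print(f"Training R^2 (Random Forest): {forest_model.score(X_train, y_train):.4f}")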
INDEX
EVALUATION METRICS
NEXT PAGE
1. Make predictions on test data
2. Calculate regression metrics
3. Visualize predictions vs. actual values
The snippet below generates the test-set predictions used by the metric and plotting code.
CODE SNIPPET 1
CODE SNIPPET 2
CODE SNIPPET 3
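The metric and plotting snippets in this section use per-model test-set predictions; this short sketch produces them, assuming the X_test split from the training step.

# Make predictions on the held-out test data
linear_pred = linear_model.predict(X_test)
forest_pred = forest_model.predict(X_test)
boost_pred = boost_model.predict(X_test)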
INDEX
IMPROVING YOUR MODEL
NEXT PAGE
1. Prevent overfitting with cross-validation
2. Tune hyperparameters to improve performance
3. Retrain with optimal parameters
A retraining sketch for step 3 follows below.
CODE SNIPPET 1
CODE SNIPPET 2
CODE SNIPPET 3
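For the retraining step, a hedged sketch that reuses the best parameters found by the grid-search snippet (grid_search) to fit the final_model used later for predictions.

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Retrain with the optimal parameters from the grid search
final_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
final_model.fit(X_train, y_train)

# Confirm the improvement on the held-out test set
tuned_mae = mean_absolute_error(y_test, final_model.predict(X_test))
print(f"Tuned Random Forest MAE: ${tuned_mae:.2f}")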
INDEX
MAKING PREDICTIONS
NEXT PAGE
1. Prepare a function to make new predictions
2. Use your model for predictions
3. Save your model for future use
A save/load example for step 3 follows below.
CODE SNIPPET 1
CODE SNIPPET 2
CODE SNIPPET 3
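For the save step, a short joblib example; the file name matches the one loaded by the Flask deployment snippet.

import joblib

# Save the trained model for future use
joblib.dump(final_model, 'housing_price_model.pkl')

# Load it back later (for example, inside the deployment API)
model = joblib.load('housing_price_model.pkl')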
INDEX
MODEL DEPLOYMENT
ASSESSMENT
CODE SNIPPET 1
1. Create a simple Flask API to serve predictions
2. Test your API with a sample request
3. Monitor your model in production
CODE SNIPPET 2
CODE SNIPPET 3
ASSESSMENT
LET'S GO
NEXT STEPS
Learning Resources:
- Online courses: Fast.ai, Coursera Machine Learning Specialization
- Websites: Kaggle competitions, Towards Data Science blog
- Documentation: scikit-learn, pandas, and matplotlib references
Practice Projects:
- Classification project: Customer churn prediction
- Time series project: Stock price forecasting
- NLP project: Sentiment analysis of product reviews
Where to Go Next:
- Progression from regression to classification problems
- Suggested skill development roadmap
- More advanced housing price models to try next
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Initialize models
linear_model = LinearRegression()
forest_model = RandomForestRegressor(n_estimators=100, random_state=42)
boost_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
from flask import Flask, request, jsonify
import joblib

app = Flask(__name__)

# Load the model
model = joblib.load('housing_price_model.pkl')

@app.route('/predict', methods=['POST'])
def predict():
    # Get data from request
    data = request.get_json()

    # Make prediction
    price = predict_house_price(
        model=model,
        bedrooms=data['bedrooms'],
        bathrooms=data['bathrooms'],
        square_feet=data['square_feet'],
        year_built=data['year_built'],
        neighborhood=data['neighborhood']
    )

    # Return prediction
    return jsonify({'predicted_price': price})

if __name__ == '__main__':
    app.run(debug=True)
import requests
import json

# Test data
test_house = {
    'bedrooms': 4,
    'bathrooms': 2.5,
    'square_feet': 2200,
    'year_built': 2010,
    'neighborhood': 'downtown'
}

# Send request to API
response = requests.post('http://localhost:5000/predict', json=test_house)

# Print response
result = response.json()
print(f"API response: ${result['predicted_price']:.2f}")
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.scatter(y_test, forest_pred, alpha=0.5)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], 'r--')
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
plt.title("Random Forest: Predicted vs. Actual Housing Prices")
plt.show()
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation
cv_scores = cross_val_score(forest_model, X, y, cv=5,
                            scoring='neg_mean_absolute_error')

# Convert negative MAE to positive and print results
mae_scores = -cv_scores
print(f"Cross-validation MAE scores: {mae_scores}")
print(f"Average MAE: ${mae_scores.mean():.2f}")
# Predict price for a specific house
price = predict_house_price(
    model=final_model,
    bedrooms=3,
    bathrooms=2,
    square_feet=1800,
    year_built=2005,
    neighborhood='suburb'
)
print(f"Predicted house price: ${price:.2f}")
# Add logging to your API
# (This replaces the earlier /predict handler; registering the same route
#  twice in one Flask app would raise an error.)
import logging

logging.basicConfig(filename='model_predictions.log', level=logging.INFO)

@app.route('/predict', methods=['POST'])
def predict():
    # Get data from request
    data = request.get_json()

    # Make prediction
    price = predict_house_price(
        model=model,
        bedrooms=data['bedrooms'],
        bathrooms=data['bathrooms'],
        square_feet=data['square_feet'],
        year_built=data['year_built'],
        neighborhood=data['neighborhood']
    )

    # Log prediction
    logging.info(f"Input: {data}, Prediction: {price}")

    # Return prediction
    return jsonify({'predicted_price': price})
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate metrics for each model
def evaluate_model(model_name, predictions, actual):
    mae = mean_absolute_error(actual, predictions)
    rmse = np.sqrt(mean_squared_error(actual, predictions))
    r2 = r2_score(actual, predictions)
    print(f"{model_name} Performance:")
    print(f"MAE: ${mae:.2f}")
    print(f"RMSE: ${rmse:.2f}")
    print(f"R² Score: {r2:.4f}")
    print("-" * 30)

evaluate_model("Linear Regression", linear_pred, y_test)
evaluate_model("Random Forest", forest_pred, y_test)
evaluate_model("Gradient Boosting", boost_pred, y_test)
# For Gradient Boosting with early stopping
from sklearn.model_selection import train_test_split

# Optional: hold out a separate validation set for your own monitoring
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

# GradientBoostingRegressor handles early stopping internally: it carves out
# validation_fraction of the training data and stops after n_iter_no_change
# rounds without improvement
boost_model = GradientBoostingRegressor(n_estimators=500,
                                        validation_fraction=0.2,
                                        n_iter_no_change=5,
                                        random_state=42)
boost_model.fit(X_train, y_train)
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Set up grid search
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_absolute_error',
    return_train_score=True
)

# Perform search
grid_search.fit(X, y)

# Get best parameters
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best MAE: ${-grid_search.best_score_:.2f}")
import pandas as pd

def predict_house_price(model, bedrooms, bathrooms, square_feet, year_built, neighborhood):
    # Create a DataFrame with the same structure as the training data
    input_data = pd.DataFrame({
        'bedrooms': [bedrooms],
        'bathrooms': [bathrooms],
        'square_feet': [square_feet],
        'property_age': [2023 - year_built],
        'price_per_sqft': [0]  # Will be calculated below
    })

    # Add neighborhood one-hot encoding
    for n in ['downtown', 'suburb', 'rural']:
        input_data[f'neighborhood_{n}'] = [1 if neighborhood == n else 0]

    # Calculate price per sqft using an average price from our dataset
    # In a real scenario, this would be calculated after prediction
    avg_price = 250000  # Example average price
    input_data['price_per_sqft'] = avg_price / input_data['square_feet']

    # Select only the features used in training
    # ('features' is the feature list defined during feature selection)
    input_features = input_data[features]

    # Make prediction
    predicted_price = model.predict(input_features)[0]
    return predicted_price