ML Coding



Linear Regression

The simplest approach uses closed-form math (ordinary least squares). Pay attention to how the numerator and denominator are formed.
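
For reference, the closed-form least-squares solution that the code below computes is:

slope = \frac{\sum_i (x_i - \bar{x})(y_i - \bar{y})}{\sum_i (x_i - \bar{x})^2}, \qquad intercept = \bar{y} - slope \cdot \bar{x}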

# Input data
X = [1, 2, 3, 4, 5]
Y = [2, 4, 5, 4, 5]

# Compute the means
mean_x = sum(X) / len(X)
mean_y = sum(Y) / len(Y)

# Compute the slope and intercept
numerator = sum((X[i] - mean_x) * (Y[i] - mean_y) for i in range(len(X)))
denominator = sum((X[i] - mean_x) ** 2 for i in range(len(X)))
slope = numerator / denominator
intercept = mean_y - slope * mean_x

# Print the results
print("Slope:", slope)
print("Intercept:", intercept)

With gradient descent, linear regression takes another form, and it scales to more complicated scenarios (e.g. multiple features and larger datasets).
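
For reference, with the mean squared error loss L = \frac{1}{n}\sum_i (y_i - \hat{y}_i)^2, the gradients that the code below accumulates (and then averages over the n samples) are:

\frac{\partial L}{\partial w_j} = -\frac{2}{n}\sum_i x_{ij}\,(y_i - \hat{y}_i), \qquad \frac{\partial L}{\partial b} = -\frac{2}{n}\sum_i (y_i - \hat{y}_i)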


import random

# Data generation
# X is a matrix: each row is a sample, each column is a feature
X = [[i, i**2] for i in range(10)]  # example with two features
Y = [2 * x[0] + 3 * x[1] + 1 + random.uniform(-1, 1) for x in X]

# Linear regression trained with gradient descent
def linear_regression(X, Y, learning_rate, epochs):
    num_samples = len(X)
    num_features = len(X[0])
    w = [0] * num_features  # initialize the weight list
    b = 0  # initialize the bias term

    for epoch in range(epochs):
        dw = [0] * num_features
        db = 0

        for i in range(num_samples):
            y_pred = sum(X[i][j] * w[j] for j in range(num_features)) + b  # predicted value
            for j in range(num_features):
                dw[j] += -2 * X[i][j] * (Y[i] - y_pred)
            db += -2 * (Y[i] - y_pred)

        for j in range(num_features):
            w[j] -= (dw[j] / num_samples) * learning_rate
        b -= (db / num_samples) * learning_rate

    return w, b

learning_rate = 0.0001
epochs = 1000
w, b = linear_regression(X, Y, learning_rate, epochs)

print(f"Linear Regression Model: Y = {w[0]}*X1 + {w[1]}*X2 + {b}")

Naive Bayes Classifier with Gaussian Density Function

The core assumption is that the features are conditionally independent of each other given the class label. Why?

  1. In real life there are often many features, and each feature can take many different values. Estimating the required joint probabilities directly from counts therefore becomes almost impossible, which is why we need to assume independence between features.

  2. If we do not assume the features are independent, the counting has to be done over the entire joint feature space, e.g. P(not handsome, not good personality, short, not ambitious | married): we would need to count, among the married, the people who match all four attributes at once. Because of data sparsity, such counts easily come out as 0, which is not workable. (See the factorization below.)
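
Under the conditional-independence assumption, the posterior factorizes into per-feature terms, and each term is modeled here with a Gaussian fitted per class (this is the standard naive Bayes formulation the code below implements):

P(y \mid x_1, \dots, x_n) \propto P(y) \prod_{i=1}^{n} P(x_i \mid y), \qquad P(x_i \mid y) = \frac{1}{\sqrt{2\pi}\,\sigma_{iy}} \exp\!\left(-\frac{(x_i - \mu_{iy})^2}{2\sigma_{iy}^2}\right)

where \mu_{iy} and \sigma_{iy} are the mean and standard deviation of feature i among the training samples of class y.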


import math

# Training data
X_train = [[5.1, 3.5], [4.9, 3.0], [5.8, 2.6], [6.0, 3.4], [6.7, 3.1]]
Y_train = ["A", "A", "B", "B", "C"]

# Test point to classify
X_test = [5.2, 3.4]

def gaussian_density(x, mean, std_dev):
    # Replace a zero standard deviation with a small value to avoid division by zero
    if std_dev == 0:
        std_dev = 1e-4
    exponent = math.exp(-((x - mean) ** 2) / (2 * std_dev ** 2))
    return (1 / (math.sqrt(2 * math.pi) * std_dev)) * exponent

def naive_bayes(X_train, Y_train, X_test):
    classes = list(set(Y_train))
    class_probabilities = {}
    for cls in classes:
        class_indices = [i for i, y in enumerate(Y_train) if y == cls]
        class_data = [X_train[i] for i in class_indices]
        class_prob = len(class_data) / len(X_train)
        feature_probabilities = [1.0] * len(X_test)
        # iterate over the features of the test sample
        for i in range(len(X_test)):
            # collect the i-th feature value from all samples of this class
            feature_values = [x[i] for x in class_data]
            mean = sum(feature_values) / len(feature_values)
            std_dev = math.sqrt(sum((x - mean) ** 2 for x in feature_values) / len(feature_values))
            # likelihood of the i-th test feature under a Gaussian fitted to this class
            feature_probabilities[i] = gaussian_density(X_test[i], mean, std_dev)
        class_probabilities[cls] = class_prob * math.prod(feature_probabilities)
    predicted_class = max(class_probabilities, key=class_probabilities.get)
    return predicted_class

predicted_class = naive_bayes(X_train, Y_train, X_test)
print(f"Predicted Class: {predicted_class}")



k-Nearest Neighbors (KNN) algorithm

Calculate the Euclidean Distance between a test point and all training points.
Identify the k nearest neighbors based on the calculated distances.
Assign a class label to the test point by majority vote among the k nearest neighbors.
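
For reference, the distance in the first step is the standard Euclidean distance between feature vectors:

d(p, q) = \sqrt{\sum_i (p_i - q_i)^2}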

import math

def euclidean_distance(point1, point2):
    # Calculate the Euclidean distance between two points
    distance = 0
    for i in range(len(point1)):
        distance += (point1[i] - point2[i]) ** 2
    return math.sqrt(distance)

def get_neighbors(x_train, y_train, x_test, k):
    neighbors = []
    
    for i in range(len(x_train)):
        dist = euclidean_distance(x_train[i], x_test)
        neighbors.append((y_train[i], dist))
    
    neighbors.sort(key=lambda x: x[1])  # Sort by distance
    
    return neighbors[:k]  # Get the k nearest neighbors

def predict_class(x_train, y_train, x_test, k):
    predictions = []
    
    for test_point in x_test:
        # Get the k nearest neighbors for the test point
        neighbors = get_neighbors(x_train, y_train, test_point, k)
        
        # Count the occurrences of each class label among the neighbors
        class_counts = {}
        for neighbor in neighbors:
            label = neighbor[0]
            if label in class_counts:
                class_counts[label] += 1
            else:
                class_counts[label] = 1
        
        # Find the class label with the highest count; ties are broken by taking the smallest label
        max_count = max(class_counts.values())
        predicted_label = min(label for label, count in class_counts.items() if count == max_count)

        # Alternative (arbitrary tie-breaking): predicted_label = max(class_counts, key=class_counts.get)
        predictions.append(predicted_label)
    
    return predictions

def solution(x_train, y_train, x_test, k):
    # Initialize the list of predicted class labels
    predicted_labels = predict_class(x_train, y_train, x_test, k)
    
    return predicted_labels

With the inputs and labels below, we get this output.

x_train = [[-2.6, 1.9, 2.0, 1.0], [-2.8, 1.7, -1.2, 1.5], [2.0, -0.9, 0.3, 2.3], [-1.5, -0.1, -1.6, -1.1], [-1.0, -0.6, -1.2, -0.7], [-0.3, 1.2, 2.6, 0.2], [-1.8, -1.3, -0.1, -1.2], [0.2, 1.2, -0.6, -1.3], [-5.2, 0.3, 0.2, 2.2], [-0.8, -0.1, 1.5, -0.1], [-2.3, 0.3, 0.8, 0.7], [0.2, 3.0, 3.6, -0.9], [1.7, -0.8, -0.0, 2.0], [2.8, 0.8, 1.8, -0.7]]
y_train = [1, 2, 0, 0, 0, 1, 0, 1, 2, 0, 2, 1, 0, 2]
x_test = [[-0.1, 1.4, 0.4, -1.0], [-1.3, 0.2, -1.3, -0.8], [-1.1, 1.5, -2.3, -2.5], [0.2, 2.0, -0.1, -0.8], [-0.3, -1.6, -3.4, -1.4]]
k = 3

predicted_labels = solution(x_train, y_train, x_test, k)
print(predicted_labels)  # Output: [1, 0, 0, 1, 0]

K-means algorithm

Assign each data point to its nearest centroid, then average the data points in each cluster to update the centroids' locations. Repeat for a set number of iterations, and finally assign each data point to its nearest updated centroid.
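
For reference, one k-means iteration alternates the two standard updates that the code below implements:

label_i = \arg\min_k \lVert x_i - \mu_k \rVert, \qquad \mu_k = \frac{1}{|C_k|} \sum_{x_i \in C_k} x_i

where C_k is the set of points currently assigned to centroid \mu_k.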

import math
def euclidean_distance(point1, point2):
    # Calculate the Euclidean distance between two points
    distance = 0
    for i in range(len(point1)):
        distance += (point1[i] - point2[i]) ** 2
    return math.sqrt(distance)


def assign_to_clusters(data, centroids):
    # Assign each data point to the nearest centroid
    cluster_labels = []
    for point in data:
        distances = [euclidean_distance(point, centroid) for centroid in centroids]
        closest_centroid = distances.index(min(distances))
        cluster_labels.append(closest_centroid)
    return cluster_labels


def update_centroids(data, cluster_labels, k, centroids):
    # Update centroids by taking the mean of the data points in each cluster
    new_centroids = []
    for i in range(k):
        cluster_points = []
        for j in range(len(data)):
            if cluster_labels[j] == i:
                cluster_points.append(data[j])
        # cluster_points = [data[j] for j in range(len(data)) if cluster_labels[j] == i]
        if cluster_points:
            # Average the cluster's points dimension by dimension
            new_centroid = []
            for s in range(len(cluster_points[0])):
                total = 0
                for t in range(len(cluster_points)):
                    total += cluster_points[t][s]
                new_centroid.append(total / len(cluster_points))
            # new_centroid = [sum(p[s] for p in cluster_points) / len(cluster_points)
            #                 for s in range(len(cluster_points[0]))]
            new_centroids.append(new_centroid)
        else:
            # Keep the previous centroid when a cluster is empty so indices stay aligned
            new_centroids.append(centroids[i])
    return new_centroids


def solution(data, k, centroids, iterations):
    for _ in range(iterations):
        cluster_labels = assign_to_clusters(data, centroids)
        centroids = update_centroids(data, cluster_labels, k, centroids)

    # Final assignment against the updated centroids
    cluster_labels = assign_to_clusters(data, centroids)
    return cluster_labels


# Test the function with the sample data below
data = [[2.0, 3.0], [-3.0, 2.0], [3.0, 3.0], [-3.0, 5.0], [-1.0, -1.0], [-2.0, -2.0], [-1.0, -2.0], [3.0, -2.0],
        [4.0, 5.0], [2.0, -2.0], [3.0, 2.0], [3.0, 1.0]]
k = 4
centroids = [[2.0, 3.0], [-3.0, 4.0], [4.0, -5.0], [-1.0, -1.0]]
iterations = 1

result = solution(data, k, centroids, iterations)
print(result)
