ML Coding
Linear Regression
The simplest way uses the closed-form math solution. Pay attention to how the numerator and the denominator are computed.
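For reference, the quantities computed in the snippet are the standard least-squares estimates for a line fitted to points (x_i, y_i):

\text{slope} = \frac{\sum_i (x_i - \bar{x})(y_i - \bar{y})}{\sum_i (x_i - \bar{x})^2}, \qquad \text{intercept} = \bar{y} - \text{slope}\cdot\bar{x}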
# Input data
X = [1, 2, 3, 4, 5]
Y = [2, 4, 5, 4, 5]
# Compute the means
mean_x = sum(X) / len(X)
mean_y = sum(Y) / len(Y)
# Compute the slope and the intercept
numerator = sum((X[i] - mean_x) * (Y[i] - mean_y) for i in range(len(X)))
denominator = sum((X[i] - mean_x) ** 2 for i in range(len(X)))
slope = numerator / denominator
intercept = mean_y - slope * mean_x
# Print the results
print("Slope:", slope)
print("Intercept:", intercept)
With gradient descent, linear regression takes another form, one that scales to more complicated scenarios such as multiple features.
import random
# Data generation
# Assume X is a matrix: each row is a sample, each column a feature
X = [[i, i**2] for i in range(10)]  # example with two features
Y = [2 * x[0] + 3 * x[1] + 1 + random.uniform(-1, 1) for x in X]
# Linear regression via gradient descent
def linear_regression(X, Y, learning_rate, epochs):
num_samples = len(X)
num_features = len(X[0])
    w = [0] * num_features  # initialize the weight list
    b = 0  # initialize the bias term
for epoch in range(epochs):
dw = [0] * num_features
db = 0
for i in range(num_samples):
            y_pred = sum(X[i][j] * w[j] for j in range(num_features)) + b  # prediction
            for j in range(num_features):
                dw[j] += -2 * X[i][j] * (Y[i] - y_pred)
            db += -2 * (Y[i] - y_pred)  # accumulated once per sample, outside the feature loop
for j in range(num_features):
w[j] -= (dw[j] / num_samples) * learning_rate
b -= (db / num_samples) * learning_rate
return w, b
learning_rate = 0.0001
epochs = 1000
w, b = linear_regression(X, Y, learning_rate, epochs)
print(f"Linear Regression Model: Y = {w[0]}*X1 + {w[1]}*X2 + {b}")
Naive Bayes Classifier with Gaussian Density Function
The assumption is that the features are conditionally independent of each other given the class. Why?
In real life, there are often many features, and each feature can take many different values. Estimating the needed class-conditional probabilities directly from counts therefore becomes almost impossible, which is why it is necessary to assume independence between features.
If we don’t assume that features are independent of each other, then when we gather statistics we have to search the entire joint feature space. For example, p(not handsome, bad personality, short, not ambitious | married) requires counting the married people who satisfy all four attributes at once. Because of data sparsity, that count can easily come out as 0, which is not appropriate.
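Formally, conditional independence given the class y factorizes the likelihood into one-dimensional terms that are easy to estimate:

P(y \mid x_1, \dots, x_n) \propto P(y) \prod_{i=1}^{n} p(x_i \mid y), \qquad p(x_i \mid y) = \frac{1}{\sqrt{2\pi}\,\sigma_{y,i}} \exp\!\left(-\frac{(x_i - \mu_{y,i})^2}{2\sigma_{y,i}^2}\right)

where \mu_{y,i} and \sigma_{y,i} are the per-class, per-feature mean and standard deviation estimated by the code below.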
import math
# Training data
X_train = [[5.1, 3.5], [4.9, 3.0], [5.8, 2.6], [6.0, 3.4], [6.7, 3.1]]
Y_train = ["A", "A", "B", "B", "C"]
# Test sample to classify
X_test = [5.2, 3.4]
def gaussian_density(x, mean, std_dev):
    # Fall back to a small pseudo standard deviation to avoid division by zero
    # (this happens when a class has a single sample or a constant feature)
    sigma = std_dev if std_dev != 0 else 0.0001
    exponent = math.exp(-((x - mean) ** 2) / (2 * sigma ** 2))
    return (1 / (math.sqrt(2 * math.pi) * sigma)) * exponent
def naive_bayes(X_train, Y_train, X_test):
classes = list(set(Y_train))
class_probabilities = {}
for cls in classes:
class_indices = [i for i, y in enumerate(Y_train) if y == cls]
class_data = [X_train[i] for i in class_indices]
class_prob = len(class_data) / len(X_train)
feature_probabilities = [1.0] * len(X_test)
        # iterate over each feature of the test sample
for i in range(len(X_test)):
# extract the i-th feature value of all samples with the same label
feature_values = [x[i] for x in class_data]
mean = sum(feature_values) / len(feature_values)
std_dev = math.sqrt(sum([(x - mean) ** 2 for x in feature_values]) / len(feature_values))
            # density of the test value under this class's Gaussian for feature i
            feature_probabilities[i] = gaussian_density(X_test[i], mean, std_dev)
        class_probabilities[cls] = class_prob * math.prod(feature_probabilities)
predicted_class = max(class_probabilities, key=class_probabilities.get)
return predicted_class
predicted_class = naive_bayes(X_train, Y_train, X_test)
print(f"Predicted Class: {predicted_class}")
k-Nearest Neighbors (KNN) algorithm
Calculate the Euclidean distance between the test point and all training points (the formula is given after this list).
Identify the k nearest neighbors based on the calculated distances.
Assign a class label to the test point by majority vote among the k nearest neighbors.
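The distance in the first step is the standard Euclidean distance between two m-dimensional points:

d(p, q) = \sqrt{\sum_{i=1}^{m} (p_i - q_i)^2}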
import math
def euclidean_distance(point1, point2):
# Calculate the Euclidean distance between two points
distance = 0
for i in range(len(point1)):
distance += (point1[i] - point2[i]) ** 2
return math.sqrt(distance)
def get_neighbors(x_train, y_train, x_test, k):
neighbors = []
for i in range(len(x_train)):
dist = euclidean_distance(x_train[i], x_test)
neighbors.append((y_train[i], dist))
neighbors.sort(key=lambda x: x[1]) # Sort by distance
return neighbors[:k] # Get the k nearest neighbors
def predict_class(x_train, y_train, x_test, k):
predictions = []
for test_point in x_test:
# Get the k nearest neighbors for the test point
neighbors = get_neighbors(x_train, y_train, test_point, k)
# Count the occurrences of each class label among the neighbors
class_counts = {}
for neighbor in neighbors:
label = neighbor[0]
if label in class_counts:
class_counts[label] += 1
else:
class_counts[label] = 1
            # Find the class label with the highest count, breaking ties by the smallest label
            max_count = max(class_counts.values())
            predicted_label = min([label for label, count in class_counts.items() if count == max_count])
# predicted_label = max(class_counts, key = class_counts.get)
predictions.append(predicted_label)
return predictions
def solution(x_train, y_train, x_test, k):
# Initialize the list of predicted class labels
predicted_labels = predict_class(x_train, y_train, x_test, k)
return predicted_labels
With the inputs and labels below, we get this output.
x_train = [[-2.6, 1.9, 2.0, 1.0], [-2.8, 1.7, -1.2, 1.5], [2.0, -0.9, 0.3, 2.3], [-1.5, -0.1, -1.6, -1.1], [-1.0, -0.6, -1.2, -0.7], [-0.3, 1.2, 2.6, 0.2], [-1.8, -1.3, -0.1, -1.2], [0.2, 1.2, -0.6, -1.3], [-5.2, 0.3, 0.2, 2.2], [-0.8, -0.1, 1.5, -0.1], [-2.3, 0.3, 0.8, 0.7], [0.2, 3.0, 3.6, -0.9], [1.7, -0.8, -0.0, 2.0], [2.8, 0.8, 1.8, -0.7]]
y_train = [1, 2, 0, 0, 0, 1, 0, 1, 2, 0, 2, 1, 0, 2]
x_test = [[-0.1, 1.4, 0.4, -1.0], [-1.3, 0.2, -1.3, -0.8], [-1.1, 1.5, -2.3, -2.5], [0.2, 2.0, -0.1, -0.8], [-0.3, -1.6, -3.4, -1.4]]
k = 3
predicted_labels = solution(x_train, y_train, x_test, k)
print(predicted_labels) # Output: [1, 0, 0, 1, 0]
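For comparison, a sketch using scikit-learn (assuming it is installed; not part of the original code). Note that tie-breaking among equally voted classes may differ from the smallest-label rule used above.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=k)   # k = 3, as above
knn.fit(x_train, y_train)
print(knn.predict(x_test))                  # may differ from the above on ties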
K-means algorithm
Assign each data point to its nearest centroid, average the data points in each cluster to update the centroids' locations, repeat for a set number of iterations, and finally assign each data point to its nearest updated centroid.
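Each iteration alternates between the assignment step and the update step, where C_k denotes the set of points currently assigned to centroid \mu_k:

c_i = \arg\min_k \lVert x_i - \mu_k \rVert, \qquad \mu_k = \frac{1}{|C_k|} \sum_{x \in C_k} x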
import math
def euclidean_distance(point1, point2):
# Calculate the Euclidean distance between two points
distance = 0
for i in range(len(point1)):
distance += (point1[i] - point2[i]) ** 2
return math.sqrt(distance)
def assign_to_clusters(data, centroids):
# Assign each data point to the nearest centroid
cluster_labels = []
for point in data:
distances = [euclidean_distance(point, centroid) for centroid in centroids]
closest_centroid = distances.index(min(distances))
cluster_labels.append(closest_centroid)
return cluster_labels
def update_centroids(data, cluster_labels, k, centroids):
    # Update each centroid to the mean of the data points assigned to it;
    # the old centroids are passed in so an empty cluster can keep its centroid
    new_centroids = []
    for i in range(k):
        cluster_points = [data[j] for j in range(len(data)) if cluster_labels[j] == i]
        if cluster_points:
            new_centroid = []
            # average dimension by dimension across the points in this cluster
            for s in range(len(cluster_points[0])):
                total = 0
                for t in range(len(cluster_points)):
                    total += cluster_points[t][s]
                new_centroid.append(total / len(cluster_points))
            new_centroids.append(new_centroid)
        else:
            # keep the previous centroid so cluster indices stay aligned
            new_centroids.append(centroids[i])
    return new_centroids
def solution(data, k, centroids, iterations):
    for _ in range(iterations):
        cluster_labels = assign_to_clusters(data, centroids)
        centroids = update_centroids(data, cluster_labels, k, centroids)
    # final assignment against the updated centroids, as described above
    cluster_labels = assign_to_clusters(data, centroids)
    return cluster_labels
# Test the function with the provided data
data = [[2.0, 3.0], [-3.0, 2.0], [3.0, 3.0], [-3.0, 5.0], [-1.0, -1.0], [-2.0, -2.0], [-1.0, -2.0], [3.0, -2.0],
[4.0, 5.0], [2.0, -2.0], [3.0, 2.0], [3.0, 1.0]]
k = 4
centroids = [[2.0, 3.0], [-3.0, 4.0], [4.0, -5.0], [-1.0, -1.0]]
iterations = 1
result = solution(data, k, centroids, iterations)
print(result)