Don't be naive
Naive Bayes (Simplified)
Imagine classifying emails as spam or not spam based on the presence of certain words. Here's a simplified approach:
```python
def classify_email(email_text, spam_words, ham_words):
    # Tokenize the email into lowercase words, stripping basic punctuation
    words = [w.strip(".,!?") for w in email_text.lower().split()]
    # Count how many known spam/ham words appear
    spam_count = sum(word in words for word in spam_words)
    ham_count = sum(word in words for word in ham_words)
    # Avoid dividing by zero when the email contains no known words
    total = spam_count + ham_count
    if total == 0:
        return "ham"
    # Calculate probabilities (assuming equal priors for spam and ham)
    p_spam = spam_count / total
    p_ham = ham_count / total
    # Classify based on the higher probability
    return "spam" if p_spam > p_ham else "ham"

# Example usage
spam_words = ["buy", "viagra", "free"]
ham_words = ["hello", "meeting", "lunch"]
email_text = "This email is about a free meeting."
print(classify_email(email_text, spam_words, ham_words))
```
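The version above is really a word-vote heuristic. Actual Naive Bayes learns per-word likelihoods from labeled training data and multiplies them together, usually in log space to avoid numerical underflow. Here's a minimal sketch of that idea, assuming tiny made-up training sets and using add-one (Laplace) smoothing so unseen words don't zero out a class:

```python
import math
from collections import Counter

def train_naive_bayes(spam_emails, ham_emails):
    # Count word frequencies in each class's training emails
    spam_counts = Counter(w for e in spam_emails for w in e.lower().split())
    ham_counts = Counter(w for e in ham_emails for w in e.lower().split())
    vocab = set(spam_counts) | set(ham_counts)
    return spam_counts, ham_counts, vocab

def classify_nb(email_text, spam_counts, ham_counts, vocab, p_spam_prior=0.5):
    # Start from the log class priors
    log_spam = math.log(p_spam_prior)
    log_ham = math.log(1 - p_spam_prior)
    spam_total = sum(spam_counts.values())
    ham_total = sum(ham_counts.values())
    v = len(vocab)
    for word in email_text.lower().split():
        # Add-one smoothing keeps an unseen word from zeroing a class
        log_spam += math.log((spam_counts[word] + 1) / (spam_total + v))
        log_ham += math.log((ham_counts[word] + 1) / (ham_total + v))
    return "spam" if log_spam > log_ham else "ham"

# Example usage with made-up training data
spam_counts, ham_counts, vocab = train_naive_bayes(
    ["buy viagra free", "free money buy now"],
    ["hello team meeting", "lunch meeting tomorrow"],
)
print(classify_nb("free lunch tomorrow", spam_counts, ham_counts, vocab))
```

With enough labeled emails, the learned counts replace hand-picked word lists entirely.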
KNN (Simplified)
Imagine having a dataset of points with features (e.g., weight, height) and their corresponding labels (e.g., "apple", "banana"). Here's a simplified way to find the k nearest neighbors of a new point:
```python
import math

def calculate_distance(point_a, point_b):
    # Euclidean distance between two points' feature vectors
    return math.dist(point_a["features"], point_b["features"])

def find_nearest_neighbors(data, target_point, k):
    # Calculate the distance from the target point to each data point
    distances = [calculate_distance(target_point, point) for point in data]
    # Sort data points by distance (ascending)
    sorted_data = sorted(zip(data, distances), key=lambda pair: pair[1])
    # Return the k nearest data points
    return [point for point, distance in sorted_data[:k]]

# Example usage
data = [
    {"features": [5, 3], "label": "apple"},
    {"features": [2, 7], "label": "banana"},
    {"features": [7, 2], "label": "apple"},
]
target_point = {"features": [4, 5]}
k = 2
nearest_neighbors = find_nearest_neighbors(data, target_point, k)
print(nearest_neighbors)
# [{'features': [5, 3], 'label': 'apple'}, {'features': [2, 7], 'label': 'banana'}]
```
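Finding neighbors is only half of KNN; to classify the new point, take a majority vote over their labels. A minimal sketch building on find_nearest_neighbors above (the knn_classify name is mine):

```python
from collections import Counter

def knn_classify(data, target_point, k):
    # The most common label among the k nearest neighbors wins
    neighbors = find_nearest_neighbors(data, target_point, k)
    votes = Counter(neighbor["label"] for neighbor in neighbors)
    return votes.most_common(1)[0][0]

print(knn_classify(data, target_point, k=3))  # "apple" (2 votes vs. 1)
```

Choosing an odd k is a common way to avoid ties in two-class problems.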
K-Means (Simplified)
Imagine having a dataset of points with features (like color values) and wanting to group them into 2 clusters. Here's a simplified way to perform basic clustering:
```python
import math

def k_means(data, k):
    # Use the first k points' features as initial centroids
    centroids = [point["features"] for point in data[:k]]
    # Iterate until the centroids stabilize
    while True:
        # Assign each data point to its nearest centroid (keyed by index)
        clusters = {i: [] for i in range(k)}
        for point in data:
            distances = [math.dist(point["features"], centroid)
                         for centroid in centroids]
            closest = distances.index(min(distances))
            clusters[closest].append(point)
        # Update each centroid to the mean of its cluster's features
        new_centroids = []
        for i in range(k):
            points = clusters[i]
            if points:
                features = [point["features"] for point in points]
                new_centroids.append([sum(column) / len(points)
                                      for column in zip(*features)])
            else:
                # Keep an empty cluster's centroid where it was
                new_centroids.append(centroids[i])
        # Stop once no centroid has moved
        if new_centroids == centroids:
            return clusters
        centroids = new_centroids

# Example usage
data = [
    {"features": [1, 1]},
    {"features": [5, 5]},
    {"features": [1, 5]},
    {"features": [5, 1]},
]
k = 2
clusters = k_means(data, k)
print(clusters)
```
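Seeding the centroids with the first k points keeps the sketch deterministic, but k-means is sensitive to initialization: a bad start can converge to a poor clustering. Real implementations pick random starting points (or use k-means++) and often run several restarts. A minimal random-seeding tweak, with init_centroids being a made-up helper name:

```python
import random

def init_centroids(data, k, seed=None):
    # Sample k distinct data points' features as starting centroids
    rng = random.Random(seed)
    return [point["features"] for point in rng.sample(data, k)]

# Drop-in replacement for the first line of k_means:
# centroids = init_centroids(data, k)
```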