Don't be naive

Naive Bayes (Simplified)

Imagine classifying emails as spam or not spam based on the presence of certain words. Here's a simplified approach that just compares counts of known spam and ham words:

python
import re

def classify_email(email_text, spam_words, ham_words):
  # Tokenize the email into lowercase words, ignoring punctuation
  words = re.findall(r"[a-z']+", email_text.lower())

  # Count how many known spam/ham words appear in the email
  spam_count = sum(word in words for word in spam_words)
  ham_count = sum(word in words for word in ham_words)

  # Avoid dividing by zero when no known words appear
  total = spam_count + ham_count
  if total == 0:
    return "ham"

  # Estimate probabilities (assuming equal priors for spam and ham)
  p_spam = spam_count / total
  p_ham = ham_count / total

  # Classify based on the higher probability (ties default to "ham")
  return "spam" if p_spam > p_ham else "ham"

# Example usage
spam_words = ["buy", "viagra", "free"]
ham_words = ["hello", "meeting", "lunch"]
email_text = "This email is about a free meeting."
print(classify_email(email_text, spam_words, ham_words))  # prints "ham" ("free" and "meeting" tie)
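
The version above just compares raw counts; real Naive Bayes multiplies per-word likelihoods learned from labeled examples. Here's a minimal sketch of that idea (the training emails and the `train_naive_bayes`/`classify_bayes` names are made up for illustration), still assuming equal priors and using Laplace smoothing so unseen words don't zero out the product:

python
from collections import Counter

def train_naive_bayes(spam_emails, ham_emails):
  # Count word frequencies in each class
  spam_counts = Counter(word for email in spam_emails for word in email.lower().split())
  ham_counts = Counter(word for email in ham_emails for word in email.lower().split())
  return spam_counts, ham_counts

def classify_bayes(email_text, spam_counts, ham_counts):
  spam_total = sum(spam_counts.values())
  ham_total = sum(ham_counts.values())
  vocab = set(spam_counts) | set(ham_counts)
  p_spam = p_ham = 1.0
  for word in email_text.lower().split():
    # Laplace smoothing: add 1 so an unseen word doesn't zero out the product
    p_spam *= (spam_counts[word] + 1) / (spam_total + len(vocab))
    p_ham *= (ham_counts[word] + 1) / (ham_total + len(vocab))
  return "spam" if p_spam > p_ham else "ham"

# Hypothetical training data
spam_emails = ["buy viagra now", "free money offer"]
ham_emails = ["hello about our meeting", "lunch tomorrow"]
spam_counts, ham_counts = train_naive_bayes(spam_emails, ham_emails)
print(classify_bayes("free offer just for you", spam_counts, ham_counts))  # prints "spam"

Multiplying many small probabilities can underflow on long emails; summing log-probabilities instead is the usual fix.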

KNN (Simplified)

Imagine having a dataset of points with features (e.g., weight, height) and their corresponding labels (e.g., "apple", "banana"). Here's a simplified way to find the k nearest neighbors of a new point:

python
import math

def calculate_distance(a, b):
  # Euclidean distance between two feature vectors
  return math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))

def find_nearest_neighbors(data, target_point, k):
  # Calculate the distance from the target point to each data point
  distances = [calculate_distance(target_point["features"], point["features"]) for point in data]

  # Sort data points by distance (ascending order)
  sorted_data = sorted(zip(data, distances), key=lambda x: x[1])

  # Return the k nearest data points
  return [point for point, _ in sorted_data[:k]]

# Example usage
data = [
  {"features": [5, 3], "label": "apple"},
  {"features": [2, 7], "label": "banana"},
  {"features": [7, 2], "label": "apple"},
]
target_point = {"features": [4, 5]}
k = 2
nearest_neighbors = find_nearest_neighbors(data, target_point, k)
print(nearest_neighbors)  # prints [{'features': [5, 3], 'label': 'apple'}, {'features': [2, 7], 'label': 'banana'}]
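
To turn those neighbors into an actual prediction, KNN typically takes a majority vote over their labels. A minimal sketch building on the example above (the `predict_label` name is mine):

python
from collections import Counter

def predict_label(data, target_point, k):
  # Majority vote over the labels of the k nearest neighbors
  neighbors = find_nearest_neighbors(data, target_point, k)
  votes = Counter(neighbor["label"] for neighbor in neighbors)
  return votes.most_common(1)[0][0]

print(predict_label(data, target_point, k))  # prints "apple" (the 1-1 tie breaks toward the closer neighbor)

With k = 2 a tie is possible, which is why odd values of k are the usual choice for binary classification.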

K-Means (Simplified)

Imagine having a dataset of points with features (like color values) and wanting to group them into k clusters (here, k = 2). Here's a simplified way to perform basic clustering:

python
def k_means(data, k):
  # Use the first k points as the initial centroids
  centroids = [point["features"] for point in data[:k]]

  # Iterate until the centroids stabilize
  while True:
    # Assign each data point to its nearest centroid (clusters keyed by centroid index)
    clusters = {i: [] for i in range(k)}
    for point in data:
      distances = [calculate_distance(point["features"], centroid) for centroid in centroids]
      closest = distances.index(min(distances))
      clusters[closest].append(point["features"])

    # Update each centroid to the mean of its cluster's points
    new_centroids = []
    for i in range(k):
      points = clusters[i]
      if points:
        new_centroids.append([sum(coords) / len(points) for coords in zip(*points)])
      else:
        new_centroids.append(centroids[i])  # keep an empty cluster's centroid as-is

    # Stop once the centroids no longer change
    if new_centroids == centroids:
      return clusters
    centroids = new_centroids

# Example usage (reusing calculate_distance from the KNN example)
data = [
  {"features": [1, 1]},
  {"features": [5, 5]},
  {"features": [1, 5]},
  {"features": [5, 1]},
]
k = 2
clusters = k_means(data, k)
print(clusters)  # prints {0: [[1, 1], [1, 5], [5, 1]], 1: [[5, 5]]}
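
One caveat: seeding the centroids with the first k points can start them right next to each other, which makes the result depend heavily on data order. A common tweak, sketched here as a variation rather than part of the code above (the `random_centroids` helper is hypothetical), is to sample the initial centroids at random:

python
import random

def random_centroids(data, k):
  # Pick k distinct data points at random to seed the centroids
  return random.sample([point["features"] for point in data], k)

# Swap this in for the first line of k_means:
#   centroids = random_centroids(data, k)

Because the outcome still depends on the draw, k-means is often run several times and the clustering with the lowest total within-cluster distance is kept.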