Source code for pyml.neighbors.knn

"""k-nearest neighbors (kNN) classifier"""

import numpy as np

from pyml.exceptions import ShapeError
from pyml.utils.metrics import manhatten_distance, euclidean_distance


class UnknownMetric(Exception):
    """Exception raised for unknown metric names

    Parameters
    ----------
    metric : str
        Name of the metric provided by the user

    Examples
    --------
    >>> from pyml.neighbors import kNNClassifier
    >>> model = kNNClassifier(metric='abc metric')
    Traceback (most recent call last):
        ...
    UnknownMetric: The metric "abc metric" is not implemented. Please refer to the documentation.
    """

    def __init__(self, metric: str) -> None:
        self.message = f'The metric "{metric}" is not implemented. Please refer to the documentation.'
        super().__init__(self.message)


class kNNClassifier:
    """Classifier model using the k-nearest neighbors algorithm

    K-nearest neighbors (kNN) is a simple and intuitive machine learning
    algorithm that can be used for classification and regression tasks. The
    model predicts the target of a data point based on the majority class
    (classification) or the average (regression) of its k nearest data points
    in the feature space.

    The following metrics are supported:

    - euclidean
    - manhatten

    Parameters
    ----------
    k : int, optional
        Number of nearest neighbors to consider when predicting on new data.
        By default 3.
    metric : str, optional
        Metric used for calculating the distance.
        By default 'euclidean'.

    Attributes
    ----------
    metrics : List[str]
        Metrics that are currently supported

    Raises
    ------
    UnknownMetric
        Raised when using an unknown metric name (including spelling errors)
    ShapeError
        Raised when computing the distance for incompatible matrices
    """

    metrics = ['euclidean', 'manhatten']

    def __init__(self, k: int = 3, metric: str = 'euclidean') -> None:
        self.k = k
        if metric not in self.metrics:
            raise UnknownMetric(metric)
        self.metric = metric

    def _compute_distance(
        self,
        x1: np.ndarray,
        x2: np.ndarray
    ) -> np.ndarray:
        """Computes the distance between two matrix-like objects using the
        defined metric

        One of the parameters must be a matrix with only one row or,
        alternatively, a vector.

        Parameters
        ----------
        x1 : numpy.ndarray
            Input matrix
        x2 : numpy.ndarray
            Input matrix

        Returns
        -------
        numpy.ndarray
            Matrix consisting of the distances

        Raises
        ------
        ShapeError
            Raised if the shapes of ``x1`` and ``x2`` are incompatible

        See Also
        --------
        pyml.exceptions.ShapeError
        """
        if self.metric == 'euclidean':
            dist_func = euclidean_distance
        elif self.metric == 'manhatten':
            dist_func = manhatten_distance

        try:
            distance = dist_func(x1, x2)
        except Exception as exc:
            raise ShapeError(x1.shape, x2.shape) from exc

        return distance

    def fit(
        self,
        X: np.ndarray,
        y: np.ndarray
    ) -> None:
        """Fit the model on training data

        Since the k-nearest neighbors algorithm is a lazy learner, there is no
        actual training step. The training data is simply stored in memory.

        Parameters
        ----------
        X : numpy.ndarray
            Input training data
        y : numpy.ndarray
            Input training labels
        """
        self.X = np.atleast_2d(X)
        self.y = y

    def predict(
        self,
        X: np.ndarray,
        return_class_prob: bool = False
    ) -> np.ndarray:
        """Calculates predictions for given data points

        Parameters
        ----------
        X : numpy.ndarray
            Input matrix; for each row the k nearest neighbors are determined
        return_class_prob : bool, optional
            If set to True, the probability of each prediction is returned as
            well (number of neighbors voting for the predicted class / k).
            By default False.

        Returns
        -------
        numpy.ndarray
            Predicted labels and, if specified, their respective probability.
        """
        # Add one dimension in case input matrix is of shape (n, )
        X = np.atleast_2d(X)

        # Compute distances between each input point and all training points
        distances = np.apply_along_axis(func1d=self._compute_distance,
                                        axis=1, arr=X, x2=self.X)

        # Get indices of the training points sorted by distance
        indices = np.apply_along_axis(func1d=np.argsort, axis=1, arr=distances)

        # Get the classes of the k nearest training points
        nearest_k_classes = self.y[indices[:, :self.k]]

        # Count how often each class occurs among the k nearest neighbors and
        # pick the most frequent one per row
        axis = 1
        classes, inverse = np.unique(nearest_k_classes, return_inverse=True)
        counts = np.apply_along_axis(np.bincount, axis,
                                     inverse.reshape(nearest_k_classes.shape),
                                     None, np.max(inverse) + 1)
        class_predictions = classes[np.argmax(counts, axis=axis)]

        if return_class_prob:
            # Fraction of the k nearest neighbors that voted for the
            # predicted (i.e. most frequent) class
            frequencies = counts.max(axis=axis) / self.k
            return class_predictions, frequencies

        return class_predictions
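

if __name__ == '__main__':
    # Minimal usage sketch (illustrative only): fit the classifier on a tiny
    # hand-made toy dataset and predict the class of two new points. The data
    # below is hypothetical and not part of the library.
    X_train = np.array([[0.0, 0.0],
                        [0.1, 0.2],
                        [1.0, 1.0],
                        [0.9, 1.1]])
    y_train = np.array([0, 0, 1, 1])

    model = kNNClassifier(k=3, metric='euclidean')
    model.fit(X_train, y_train)

    X_new = np.array([[0.0, 0.1],
                      [1.0, 0.9]])
    predictions, probabilities = model.predict(X_new, return_class_prob=True)
    print(predictions)    # majority class among the 3 nearest training points
    print(probabilities)  # fraction of the 3 neighbors voting for that class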