Source code for pyml.neighbors.knn

"""k-nearest neighbors (kNN) classifier"""

import numpy as np

from pyml.exceptions import ShapeError
from pyml.utils.metrics import manhatten_distance, euclidean_distance


class UnknownMetric(Exception):
    """Exception raised for unknown metric names

    Parameters
    ----------
    metric : str
        Name of the metric provided by the user

    Examples
    --------
    >>> from pyml.neighbors import kNNClassifier
    >>> model = kNNClassifier(metric='abc metric')
    Traceback (most recent call last):
        ...
    UnknownMetric: The metric "abc metric" is not implemented. Please refer to the documentation.
    """

    def __init__(self, metric: str) -> None:
        self.message = f'The metric "{metric}" is not implemented. Please refer to the documentation.'
        super().__init__(self.message)


class kNNClassifier:
    """Classifier model using the k-nearest neighbors algorithm

    K-nearest neighbors (kNN) is a simple and intuitive machine learning
    algorithm that can be used for classification and regression tasks. The
    model predicts the target of a data point based on the majority class
    (classification) or the average (regression) of its k nearest data points
    in the feature space.

    The following metrics are supported:

    - euclidean
    - manhatten

    Parameters
    ----------
    k : int, optional
        Number of nearest neighbors to consider when predicting on new data.
        By default 3.
    metric : str, optional
        Metric used for calculating the distance.
        By default 'euclidean'.

    Attributes
    ----------
    metrics : List[str]
        Metrics that are currently supported

    Raises
    ------
    UnknownMetric
        Raised when using an unknown metric name (including spelling errors)
    ShapeError
        Raised when computing the distance for incompatible matrices
    """

    metrics = ['euclidean', 'manhatten']

    def __init__(self, k: int = 3, metric: str = 'euclidean') -> None:
        self.k = k
        if metric not in self.metrics:
            raise UnknownMetric(metric)
        self.metric = metric

    def _compute_distance(
        self,
        x1: np.ndarray,
        x2: np.ndarray
    ) -> np.ndarray:
        """Computes the distance between two matrix-like objects using the
        defined metric

        One of the parameters must be a matrix with only one row or,
        alternatively, a vector.

        Parameters
        ----------
        x1 : numpy.ndarray
            Input matrix
        x2 : numpy.ndarray
            Input matrix

        Returns
        -------
        numpy.ndarray
            Matrix consisting of the distances

        Raises
        ------
        ShapeError
            Raised if the shapes of ``x1`` and ``x2`` are incompatible

        See Also
        --------
        pyml.exceptions.ShapeError
        """
        if self.metric == 'euclidean':
            dist_func = euclidean_distance
        elif self.metric == 'manhatten':
            dist_func = manhatten_distance

        try:
            distance = dist_func(x1, x2)
        except Exception as exc:
            raise ShapeError(x1.shape, x2.shape) from exc

        return distance

    def fit(
        self,
        X: np.ndarray,
        y: np.ndarray
    ) -> None:
        """Fit the model on training data

        Since the k-nearest neighbors algorithm is a lazy learner, there is no
        actual training step. The training data is simply stored in memory.

        Parameters
        ----------
        X : numpy.ndarray
            Input training data
        y : numpy.ndarray
            Input training labels
        """
        self.X = np.atleast_2d(X)
        self.y = y

    def predict(
        self,
        X: np.ndarray,
        return_class_prob: bool = False
    ) -> np.ndarray:
        """Calculates predictions for given data points

        Parameters
        ----------
        X : numpy.ndarray
            Input matrix; for each row the k nearest neighbors are determined
        return_class_prob : bool, optional
            If set to True, the probability of each prediction is returned as
            well (number of neighbors voting for the predicted class / k).
            By default False.

        Returns
        -------
        numpy.ndarray
            Predicted labels and, if specified, their respective probability.
        """
        # Add one dimension in case input matrix is of shape (n, )
        X = np.atleast_2d(X)

        # Compute distances between each input point and all training points
        distances = np.apply_along_axis(func1d=self._compute_distance,
                                        axis=1, arr=X, x2=self.X)

        # Get indices of the training points sorted by distance
        indices = np.apply_along_axis(func1d=np.argsort, axis=1, arr=distances)

        # Get the classes of the k nearest training points
        nearest_k_classes = self.y[indices[:, :self.k]]

        # Count how often each class occurs among the k nearest neighbors and
        # pick the most frequent one per row
        axis = 1
        classes, inverse = np.unique(nearest_k_classes, return_inverse=True)
        counts = np.apply_along_axis(np.bincount, axis,
                                     inverse.reshape(nearest_k_classes.shape),
                                     None, np.max(inverse) + 1)
        class_predictions = classes[np.argmax(counts, axis=axis)]

        if return_class_prob:
            # Fraction of the k nearest neighbors that voted for the
            # predicted (i.e. most frequent) class
            frequencies = counts.max(axis=axis) / self.k
            return class_predictions, frequencies

        return class_predictions
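

if __name__ == '__main__':
    # Minimal usage sketch (illustrative only): fit the classifier on a tiny
    # hand-made toy dataset and predict the class of two new points. The data
    # below is hypothetical and not part of the library.
    X_train = np.array([[0.0, 0.0],
                        [0.1, 0.2],
                        [1.0, 1.0],
                        [0.9, 1.1]])
    y_train = np.array([0, 0, 1, 1])

    model = kNNClassifier(k=3, metric='euclidean')
    model.fit(X_train, y_train)

    X_new = np.array([[0.0, 0.1],
                      [1.0, 0.9]])
    predictions, probabilities = model.predict(X_new, return_class_prob=True)
    print(predictions)    # majority class among the 3 nearest training points
    print(probabilities)  # fraction of the 3 neighbors voting for that class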