Source code for btQuant.ml

import numpy as np


[docs]
def regressionTree(X, y, maxDepth=5):
    """
    Regression tree using MSE splits.
    
    Parameters:
        X: features (n_samples, n_features)
        y: target values
        maxDepth: maximum tree depth
    
    Returns:
        dict: tree structure
    """
    def mse(y):
        return np.mean((y - np.mean(y))**2)
    
    def buildTree(X, y, depth):
        if depth == maxDepth or len(np.unique(y)) == 1 or len(y) < 2:
            return np.mean(y)
        
        bestMse = float('inf')
        bestSplit = None
        bestLeftX, bestRightX = None, None
        bestLeftY, bestRightY = None, None
        
        nFeatures = X.shape[1]
        for feature in range(nFeatures):
            uniqueVals = np.unique(X[:, feature])
            for value in uniqueVals:
                leftMask = X[:, feature] <= value
                rightMask = ~leftMask
                
                if np.sum(leftMask) == 0 or np.sum(rightMask) == 0:
                    continue
                
                leftX, rightX = X[leftMask], X[rightMask]
                leftY, rightY = y[leftMask], y[rightMask]
                
                splitMse = (mse(leftY) * len(leftY) + mse(rightY) * len(rightY)) / len(y)
                
                if splitMse < bestMse:
                    bestMse = splitMse
                    bestSplit = (feature, value)
                    bestLeftX, bestRightX = leftX, rightX
                    bestLeftY, bestRightY = leftY, rightY
        
        if bestSplit is None:
            return np.mean(y)
        
        feature, value = bestSplit
        leftTree = buildTree(bestLeftX, bestLeftY, depth + 1)
        rightTree = buildTree(bestRightX, bestRightY, depth + 1)
        
        return {'feature': feature, 'value': value, 'left': leftTree, 'right': rightTree}
    
    return buildTree(X, y, 0)



[docs]
def predictTree(tree, X):
    """
    Predict using a regression or decision tree.
    
    Parameters:
        tree: tree structure from regressionTree or decisionTree
        X: features to predict (n_samples, n_features)
    
    Returns:
        predictions array
    """
    def predictSingle(tree, x):
        while isinstance(tree, dict):
            if x[tree['feature']] <= tree['value']:
                tree = tree['left']
            else:
                tree = tree['right']
        return tree
    
    return np.array([predictSingle(tree, x) for x in X])



[docs]
def isolationForest(X, nTrees=100, maxSamples=None, maxDepth=10):
    """
    Isolation forest for anomaly detection.
    
    Parameters:
        X: features (n_samples, n_features)
        nTrees: number of trees
        maxSamples: samples per tree (default all)
        maxDepth: maximum tree depth
    
    Returns:
        list of isolation trees
    """
    def buildTree(X, depth):
        if depth >= maxDepth or len(X) <= 1:
            return {'size': len(X)}
        
        feature = np.random.randint(0, X.shape[1])
        minVal, maxVal = X[:, feature].min(), X[:, feature].max()
        
        if minVal == maxVal:
            return {'size': len(X)}
        
        splitVal = np.random.uniform(minVal, maxVal)
        
        leftMask = X[:, feature] <= splitVal
        rightMask = ~leftMask
        
        if np.sum(leftMask) == 0 or np.sum(rightMask) == 0:
            return {'size': len(X)}
        
        leftTree = buildTree(X[leftMask], depth + 1)
        rightTree = buildTree(X[rightMask], depth + 1)
        
        return {'feature': feature, 'value': splitVal, 'left': leftTree, 'right': rightTree}
    
    trees = []
    nSamples = X.shape[0]
    if maxSamples is None:
        maxSamples = min(256, nSamples)
    
    for _ in range(nTrees):
        indices = np.random.choice(nSamples, min(maxSamples, nSamples), replace=False)
        tree = buildTree(X[indices], 0)
        trees.append(tree)
    
    return trees



[docs]
def anomalyScore(trees, X):
    """
    Compute anomaly scores from isolation forest.
    
    Parameters:
        trees: list of isolation trees
        X: features to score
    
    Returns:
        anomaly scores (higher = more anomalous)
    """
    def pathLength(tree, x, currentDepth=0):
        if not isinstance(tree, dict) or 'size' in tree:
            size = tree.get('size', 1)
            if size <= 1:
                return currentDepth
            c = 2 * (np.log(size - 1) + 0.5772156649) - 2 * (size - 1) / size
            return currentDepth + c
        
        if x[tree['feature']] <= tree['value']:
            return pathLength(tree['left'], x, currentDepth + 1)
        else:
            return pathLength(tree['right'], x, currentDepth + 1)
    
    n = len(trees[0]) if isinstance(trees[0], dict) else 256
    c = 2 * (np.log(n - 1) + 0.5772156649) - 2 * (n - 1) / n
    
    scores = []
    for x in X:
        avgPath = np.mean([pathLength(tree, x) for tree in trees])
        score = 2 ** (-avgPath / c)
        scores.append(score)
    
    return np.array(scores)



[docs]
def kmeans(X, k=3, maxIters=100, tol=1e-4):
    """
    K-means clustering.
    
    Parameters:
        X: features (n_samples, n_features)
        k: number of clusters
        maxIters: maximum iterations
        tol: convergence tolerance
    
    Returns:
        centroids, labels
    """
    nSamples = X.shape[0]
    centroids = X[np.random.choice(nSamples, k, replace=False)]
    
    for iteration in range(maxIters):
        distances = np.linalg.norm(X[:, np.newaxis] - centroids, axis=2)
        labels = np.argmin(distances, axis=1)
        
        newCentroids = np.array([X[labels == i].mean(axis=0) if np.sum(labels == i) > 0 
                                 else centroids[i] for i in range(k)])
        
        if np.linalg.norm(centroids - newCentroids) < tol:
            break
        
        centroids = newCentroids
    
    return centroids, labels



[docs]
def knn(XTrain, yTrain, XTest, k=3):
    """
    K-nearest neighbors classifier.
    
    Parameters:
        XTrain: training features
        yTrain: training labels
        XTest: test features
        k: number of neighbors
    
    Returns:
        predicted labels
    """
    predictions = []
    
    for testPoint in XTest:
        distances = np.sqrt(np.sum((XTrain - testPoint)**2, axis=1))
        sortedIndices = np.argsort(distances)
        nearestNeighbors = yTrain[sortedIndices[:k]]
        mostCommon = np.bincount(nearestNeighbors.astype(int)).argmax()
        predictions.append(mostCommon)
    
    return np.array(predictions)



[docs]
def naiveBayes(XTrain, yTrain, XTest):
    """
    Gaussian naive Bayes classifier.
    
    Parameters:
        XTrain: training features
        yTrain: training labels
        XTest: test features
    
    Returns:
        predicted labels
    """
    def gaussianPdf(x, mean, std):
        eps = 1e-10
        return (1 / (std + eps) / np.sqrt(2 * np.pi)) * np.exp(-0.5 * ((x - mean) / (std + eps))**2)
    
    classes = np.unique(yTrain)
    meanStd = {}
    
    for cls in classes:
        XCls = XTrain[yTrain == cls]
        meanStd[cls] = {
            'mean': np.mean(XCls, axis=0),
            'std': np.std(XCls, axis=0) + 1e-10
        }
    
    predictions = []
    
    for testPoint in XTest:
        classProbs = {}
        
        for cls in classes:
            likelihood = np.prod(gaussianPdf(testPoint, meanStd[cls]['mean'], meanStd[cls]['std']))
            classProbs[cls] = likelihood
        
        predictedClass = max(classProbs, key=classProbs.get)
        predictions.append(predictedClass)
    
    return np.array(predictions)



[docs]
def decisionTree(X, y, maxDepth=5):
    """
    Decision tree classifier using Gini impurity.
    
    Parameters:
        X: features (n_samples, n_features)
        y: target labels
        maxDepth: maximum tree depth
    
    Returns:
        dict: tree structure
    """
    def giniImpurity(y):
        if len(y) == 0:
            return 0
        _, counts = np.unique(y, return_counts=True)
        probs = counts / len(y)
        return 1 - np.sum(probs**2)
    
    def bestSplit(X, y):
        bestGini = float('inf')
        bestSplit = None
        bestLeftX, bestRightX = None, None
        bestLeftY, bestRightY = None, None
        
        for feature in range(X.shape[1]):
            uniqueVals = np.unique(X[:, feature])
            for value in uniqueVals:
                leftMask = X[:, feature] <= value
                rightMask = ~leftMask
                
                if np.sum(leftMask) == 0 or np.sum(rightMask) == 0:
                    continue
                
                leftY, rightY = y[leftMask], y[rightMask]
                
                gini = (len(leftY) * giniImpurity(leftY) + 
                       len(rightY) * giniImpurity(rightY)) / len(y)
                
                if gini < bestGini:
                    bestGini = gini
                    bestSplit = (feature, value)
                    bestLeftX, bestRightX = X[leftMask], X[rightMask]
                    bestLeftY, bestRightY = leftY, rightY
        
        return bestSplit, bestLeftX, bestRightX, bestLeftY, bestRightY
    
    def buildTree(X, y, depth):
        if depth == maxDepth or len(np.unique(y)) == 1 or len(y) < 2:
            return int(np.bincount(y.astype(int)).argmax())
        
        split, leftX, rightX, leftY, rightY = bestSplit(X, y)
        
        if split is None:
            return int(np.bincount(y.astype(int)).argmax())
        
        feature, value = split
        
        leftTree = buildTree(leftX, leftY, depth + 1)
        rightTree = buildTree(rightX, rightY, depth + 1)
        
        return {'feature': feature, 'value': value, 'left': leftTree, 'right': rightTree}
    
    return buildTree(X, y, 0)



[docs]
def randomForest(X, y, nEstimators=10, maxDepth=5, sampleRatio=0.8):
    """
    Random forest regressor.
    
    Parameters:
        X: features
        y: target values
        nEstimators: number of trees
        maxDepth: maximum tree depth
        sampleRatio: fraction of samples per tree
    
    Returns:
        predict function
    """
    trees = []
    
    for _ in range(nEstimators):
        idx = np.random.choice(len(X), int(len(X) * sampleRatio), replace=True)
        tree = regressionTree(X[idx], y[idx], maxDepth=maxDepth)
        trees.append(tree)
    
    def predict(XNew):
        preds = np.array([predictTree(tree, XNew) for tree in trees])
        return preds.mean(axis=0)
    
    return predict



[docs]
def gradientBoosting(X, y, nEstimators=100, learningRate=0.1, maxDepth=3):
    """
    Gradient boosting regressor.
    
    Parameters:
        X: features
        y: target values
        nEstimators: number of boosting rounds
        learningRate: learning rate (shrinkage)
        maxDepth: maximum tree depth
    
    Returns:
        predict function
    """
    yPred = np.zeros_like(y, dtype=float)
    models = []
    
    for _ in range(nEstimators):
        residual = y - yPred
        tree = regressionTree(X, residual, maxDepth=maxDepth)
        models.append(tree)
        preds = predictTree(tree, X)
        yPred += learningRate * preds
    
    def predict(XNew):
        total = np.zeros(len(XNew))
        for tree in models:
            total += learningRate * predictTree(tree, XNew)
        return total
    
    return predict



[docs]
def pca(X, nComponents=None):
    """
    Principal component analysis.
    
    Parameters:
        X: features (n_samples, n_features)
        nComponents: number of components to keep
    
    Returns:
        transformed data, eigenvalues, eigenvectors
    """
    XMean = np.mean(X, axis=0)
    XCentered = X - XMean
    
    covMatrix = np.cov(XCentered, rowvar=False)
    
    eigenvalues, eigenvectors = np.linalg.eigh(covMatrix)
    
    sortedIndices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sortedIndices]
    eigenvectors = eigenvectors[:, sortedIndices]
    
    if nComponents is not None:
        eigenvectors = eigenvectors[:, :nComponents]
        eigenvalues = eigenvalues[:nComponents]
    
    XPca = XCentered @ eigenvectors
    
    return XPca, eigenvalues, eigenvectors



[docs]
def lda(X, y, nComponents=None):
    """
    Linear discriminant analysis.
    
    Parameters:
        X: features
        y: class labels
        nComponents: number of components
    
    Returns:
        transformed data, eigenvalues, eigenvectors
    """
    meanOverall = np.mean(X, axis=0)
    
    classes = np.unique(y)
    meanClasses = np.array([np.mean(X[y == c], axis=0) for c in classes])
    
    Sb = np.zeros((X.shape[1], X.shape[1]))
    for c, meanClass in zip(classes, meanClasses):
        nc = X[y == c].shape[0]
        meanDiff = (meanClass - meanOverall).reshape(-1, 1)
        Sb += nc * (meanDiff @ meanDiff.T)
    
    Sw = np.zeros((X.shape[1], X.shape[1]))
    for c, meanClass in zip(classes, meanClasses):
        XC = X[y == c]
        meanDiff = (XC - meanClass).T
        Sw += meanDiff @ meanDiff.T
    
    try:
        SwInv = np.linalg.inv(Sw + 1e-6 * np.eye(Sw.shape[0]))
        eigenvals, eigenvecs = np.linalg.eig(SwInv @ Sb)
    except:
        eigenvals = np.zeros(min(len(classes) - 1, X.shape[1]))
        eigenvecs = np.eye(X.shape[1], len(eigenvals))
    
    sortedIndices = np.argsort(np.abs(eigenvals))[::-1]
    eigenvals = eigenvals[sortedIndices]
    eigenvecs = eigenvecs[:, sortedIndices]
    
    if nComponents is not None:
        eigenvecs = eigenvecs[:, :nComponents]
        eigenvals = eigenvals[:nComponents]
    
    XLda = (X - meanOverall) @ eigenvecs.real
    
    return XLda, eigenvals.real, eigenvecs.real



[docs]
def logisticRegression(X, y, learningRate=0.01, nIters=1000):
    """
    Logistic regression classifier.
    
    Parameters:
        X: features
        y: binary labels (0/1)
        learningRate: learning rate
        nIters: number of iterations
    
    Returns:
        weights, bias, predict function
    """
    nSamples, nFeatures = X.shape
    weights = np.zeros(nFeatures)
    bias = 0
    
    for _ in range(nIters):
        linearModel = X @ weights + bias
        yPred = 1 / (1 + np.exp(-linearModel))
        
        dw = (1 / nSamples) * (X.T @ (yPred - y))
        db = (1 / nSamples) * np.sum(yPred - y)
        
        weights -= learningRate * dw
        bias -= learningRate * db
    
    def predict(XNew):
        linearModel = XNew @ weights + bias
        yPred = 1 / (1 + np.exp(-linearModel))
        return (yPred >= 0.5).astype(int)
    
    return weights, bias, predict