Add describe functions (#77)

Closes issue #65 * added describe feature (pandas.describe) * Update README.md --------- Co-authored-by: Montana Flynn <montana949@gmail.com>
montanaflynn · May 11, 2023 · b9dad85 · b9dad85
1 parent a145605
commit b9dad85
Show file tree

Hide file tree

Showing 5 changed files with 184 additions and 12 deletions.
diff --git a/README.md b/README.md
@@ -76,6 +76,8 @@ func Correlation(data1, data2 Float64Data) (float64, error) {}
 func Covariance(data1, data2 Float64Data) (float64, error) {}
 func CovariancePopulation(data1, data2 Float64Data) (float64, error) {}
 func CumulativeSum(input Float64Data) ([]float64, error) {}
+func Describe(input Float64Data, allowNaN bool, percentiles *[]float64) (*Description, error) {}
+func DescribePercentileFunc(input Float64Data, allowNaN bool, percentiles *[]float64, percentileFunc func(Float64Data, float64) (float64, error)) (*Description, error) {}
 func Entropy(input Float64Data) (float64, error) {}
 func EuclideanDistance(dataPointX, dataPointY Float64Data) (distance float64, err error) {}
 func GeometricMean(input Float64Data) (float64, error) {}

diff --git a/describe.go b/describe.go
@@ -0,0 +1,81 @@
+package stats
+
+import "fmt"
+
+// Description holds the summary statistics that Describe and
+// DescribePercentileFunc compute for a dataset.
+//
+// Count is the number of values in the input. Mean, Std, Max and Min are the
+// results of calling Mean, StandardDeviation, Max and Min on the input.
+// DescriptionPercentiles holds one entry per requested percentile that was
+// kept, in the order the percentiles were requested. AllowedNaN records the
+// allowNaN argument the description was built with.
+type Description struct {
+	Count                  int
+	Mean                   float64
+	Std                    float64
+	Max                    float64
+	Min                    float64
+	DescriptionPercentiles []descriptionPercentile
+	AllowedNaN             bool
+}
+
+// descriptionPercentile pairs a requested percentile (Percentile) with the
+// value computed for it (Value). The type is unexported, so code outside the
+// package cannot name it, but its exported fields remain readable through
+// Description.DescriptionPercentiles.
+type descriptionPercentile struct {
+	Percentile float64
+	Value      float64
+}
+
+// Describe computes descriptive statistics for input, similar to Python's
+// pandas.describe(). It is a convenience wrapper that calls
+// DescribePercentileFunc with the package's Percentile function as the
+// percentile calculator.
+//
+// With allowNaN set to false an empty input yields ErrEmptyInput. percentiles
+// may be nil, in which case no percentiles are computed.
+func Describe(input Float64Data, allowNaN bool, percentiles *[]float64) (*Description, error) {
+	description, err := DescribePercentileFunc(input, allowNaN, percentiles, Percentile)
+	return description, err
+}
+
+// DescribePercentileFunc generates descriptive statistics about a provided
+// dataset, similar to Python's pandas.describe(), using percentileFunc to
+// compute each requested percentile.
+//
+// Arguments:
+//
+//	input:          the dataset to describe
+//	allowNaN:       when false, an empty input returns ErrEmptyInput and any
+//	                percentile whose calculation fails is left out of the
+//	                result; when true, nothing is rejected and every
+//	                statistic and percentile is stored with whatever value
+//	                the underlying function returned
+//	percentiles:    the percentiles to compute; nil computes none
+//	percentileFunc: the percentile calculator; nil falls back to Percentile
+//
+// Return:
+//
+//	The description and an error. The description is never nil, even when
+//	ErrEmptyInput is returned; in that case only Count and AllowedNaN are set.
+func DescribePercentileFunc(input Float64Data, allowNaN bool, percentiles *[]float64, percentileFunc func(Float64Data, float64) (float64, error)) (*Description, error) {
+	description := &Description{
+		AllowedNaN: allowNaN,
+		Count:      input.Len(),
+	}
+
+	if description.Count == 0 && !allowNaN {
+		return description, ErrEmptyInput
+	}
+
+	// The errors are ignored on purpose: an empty input has already been
+	// rejected above unless allowNaN is set, and in that case the values the
+	// functions return alongside their error are kept as-is.
+	description.Std, _ = StandardDeviation(input)
+	description.Max, _ = Max(input)
+	description.Min, _ = Min(input)
+	description.Mean, _ = Mean(input)
+
+	if percentiles == nil {
+		return description, nil
+	}
+
+	// Calling a nil function value would panic; use the package default instead.
+	if percentileFunc == nil {
+		percentileFunc = Percentile
+	}
+
+	for _, percentile := range *percentiles {
+		value, err := percentileFunc(input, percentile)
+		if err != nil && !allowNaN {
+			// Without allowNaN a failed percentile is dropped rather than reported.
+			continue
+		}
+		description.DescriptionPercentiles = append(description.DescriptionPercentiles, descriptionPercentile{Percentile: percentile, Value: value})
+	}
+
+	return description, nil
+}
+
+// String renders the description as one "label<TAB>value" line per statistic,
+// formatting every floating-point value with the given number of decimals.
+// Percentile labels are always printed with two decimals followed by a percent
+// sign. The final "NaN OK" line has no trailing newline.
+//
+// Because it takes an argument, this method does not satisfy fmt.Stringer.
+//
+// Example output for decimals = 2 (label and value are separated by a tab):
+//
+//	count   3
+//	mean    2.00
+//	std     0.82
+//	max     3.00
+//	min     1.00
+//	25.00%  NaN
+//	50.00%  1.50
+//	75.00%  2.50
+//	NaN OK  true
+func (d *Description) String(decimals int) string {
+	out := fmt.Sprintf("count\t%d\n", d.Count) +
+		fmt.Sprintf("mean\t%.*f\n", decimals, d.Mean) +
+		fmt.Sprintf("std\t%.*f\n", decimals, d.Std) +
+		fmt.Sprintf("max\t%.*f\n", decimals, d.Max) +
+		fmt.Sprintf("min\t%.*f\n", decimals, d.Min)
+
+	for i := range d.DescriptionPercentiles {
+		entry := d.DescriptionPercentiles[i]
+		out += fmt.Sprintf("%.2f%%\t%.*f\n", entry.Percentile, decimals, entry.Value)
+	}
+
+	return out + fmt.Sprintf("NaN OK\t%t", d.AllowedNaN)
+}
diff --git a/describe_test.go b/describe_test.go
@@ -0,0 +1,83 @@
+package stats_test
+
+import (
+	"math"
+	"testing"
+
+	"github.com/montanaflynn/stats"
+)
+
+// TestDescribeValidDataset checks that describing a non-empty dataset with
+// allowNaN disabled does not return an error.
+func TestDescribeValidDataset(t *testing.T) {
+	data := []float64{1.0, 2.0, 3.0}
+	requested := []float64{25.0, 50.0, 75.0}
+
+	if _, err := stats.Describe(data, false, &requested); err != nil {
+		t.Errorf("Returned an error")
+	}
+}
+
+// TestDescribeEmptyDataset checks that an empty dataset is rejected with
+// ErrEmptyInput when allowNaN is disabled.
+func TestDescribeEmptyDataset(t *testing.T) {
+	empty := []float64{}
+
+	if _, err := stats.Describe(empty, false, nil); err != stats.ErrEmptyInput {
+		t.Errorf("Did not return empty input error")
+	}
+}
+
+// TestDescribeEmptyDatasetNaN checks that an empty dataset is accepted when
+// allowNaN is enabled and that Max, Mean, Min and Std are all reported as NaN.
+func TestDescribeEmptyDatasetNaN(t *testing.T) {
+	describe, err := stats.Describe([]float64{}, true, nil)
+	if err != nil {
+		t.Errorf("Returned an error")
+	}
+
+	// Report at most one failure, on the first statistic that is not NaN.
+	for _, statistic := range []float64{describe.Max, describe.Mean, describe.Min, describe.Std} {
+		if !math.IsNaN(statistic) {
+			t.Errorf("Was not NaN")
+			break
+		}
+	}
+}
+
+// TestDescribeValidDatasetNaN checks that enabling allowNaN on a non-empty
+// dataset returns no error and still produces a real (non-NaN) maximum.
+func TestDescribeValidDatasetNaN(t *testing.T) {
+	data := []float64{1.0, 2.0, 3.0}
+	requested := []float64{25.0, 50.0, 75.0}
+
+	describe, err := stats.Describe(data, true, &requested)
+	if err != nil {
+		t.Errorf("Returned an error")
+	}
+
+	if maxIsNaN := math.IsNaN(describe.Max); maxIsNaN {
+		t.Errorf("Was NaN")
+	}
+}
+
+// TestDescribeValues checks that the fields of a Description agree with the
+// dataset it was built from: the count, the number of percentiles, and the
+// Max, Min, Mean and Std values as computed by the standalone functions.
+func TestDescribeValues(t *testing.T) {
+	dataSet := []float64{1.0, 2.0, 3.0}
+	percentiles := []float64{25.0, 50.0, 75.0}
+	describe, _ := stats.Describe(dataSet, true, &percentiles)
+
+	if describe.Count != len(dataSet) {
+		t.Errorf("Count was not == length of dataset")
+	}
+	if len(describe.DescriptionPercentiles) != len(percentiles) {
+		t.Errorf("Percentiles length was not == length of input percentiles")
+	}
+
+	// Reference values from the standalone functions; errors are ignored
+	// because the dataset is non-empty.
+	wantMax, _ := stats.Max(dataSet)
+	wantMin, _ := stats.Min(dataSet)
+	wantMean, _ := stats.Mean(dataSet)
+	wantStd, _ := stats.StandardDeviation(dataSet)
+
+	comparisons := []struct {
+		want, got float64
+		failure   string
+	}{
+		{wantMax, describe.Max, "Max was not equal to Max(dataset)"},
+		{wantMin, describe.Min, "Min was not equal to Min(dataset)"},
+		{wantMean, describe.Mean, "Mean was not equal to Mean(dataset)"},
+		{wantStd, describe.Std, "Std was not equal to StandardDeviation(dataset)"},
+	}
+	for _, c := range comparisons {
+		if c.want != c.got {
+			t.Error(c.failure)
+		}
+	}
+}
+
+// TestDescribeString checks the exact text produced by Description.String
+// with two decimals, including the NaN shown for the 25th percentile of this
+// dataset.
+func TestDescribeString(t *testing.T) {
+	const want = "count\t3\nmean\t2.00\nstd\t0.82\nmax\t3.00\nmin\t1.00\n25.00%\tNaN\n50.00%\t1.50\n75.00%\t2.50\nNaN OK\ttrue"
+
+	describe, err := stats.Describe([]float64{1.0, 2.0, 3.0}, true, &[]float64{25.0, 50.0, 75.0})
+	if err != nil {
+		t.Fatalf("Describe returned an unexpected error: %v", err)
+	}
+
+	// %q makes the tabs and newlines visible when the comparison fails.
+	if got := describe.String(2); got != want {
+		t.Errorf("String(2) = %q, want %q", got, want)
+	}
+}
diff --git a/distances.go b/distances.go
@@ -62,16 +62,19 @@ func ManhattanDistance(dataPointX, dataPointY Float64Data) (distance float64, er
 // MinkowskiDistance computes the Minkowski distance between two data sets
 //
 // Arguments:
-//    dataPointX: First set of data points
-//    dataPointY: Second set of data points. Length of both data
-//                sets must be equal.
-//    lambda:     aka p or city blocks; With lambda = 1
-//                returned distance is manhattan distance and
-//                lambda = 2; it is euclidean distance. Lambda
-//                reaching to infinite - distance would be chebysev
-//                distance.
+//
+//	dataPointX: First set of data points
+//	dataPointY: Second set of data points. Length of both data
+//	            sets must be equal.
+//	lambda:     aka p or city blocks; with lambda = 1 the
+//	            returned distance is the Manhattan distance, and
+//	            with lambda = 2 it is the Euclidean distance. As
+//	            lambda approaches infinity, the distance approaches
+//	            the Chebyshev distance.
+//
 // Return:
-//     Distance or error
+//
+//	Distance or error
 func MinkowskiDistance(dataPointX, dataPointY Float64Data, lambda float64) (distance float64, err error) {
 	err = validateData(dataPointX, dataPointY)
 	if err != nil {

diff --git a/examples/main.go b/examples/main.go
@@ -180,10 +180,13 @@ func main() {
 	prob1 := 0.5
 	exp, _ := stats.ExpGeom(prob1)
 	fmt.Println(exp)
-	// Output: 
-	
-	prob2:= 0.5
+	// Output:
+
+	prob2 := 0.5
 	vari, _ := stats.VarGeom(prob2)
 	fmt.Println(vari)
 	// Output: 2
+
+	description, _ := stats.Describe([]float64{1.0, 2.0, 3.0}, true, &[]float64{25.0, 50.0, 75.0})
+	fmt.Println(description.String(2))
 }