Skip to content

Commit

Permalink
Add describe functions (#77)
Browse files Browse the repository at this point in the history
Closes issue #65

* added describe feature (pandas.describe)

* Update README.md

---------

Co-authored-by: Montana Flynn <montana949@gmail.com>
  • Loading branch information
nurjeff and montanaflynn committed May 11, 2023
1 parent a145605 commit b9dad85
Show file tree
Hide file tree
Showing 5 changed files with 184 additions and 12 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ func Correlation(data1, data2 Float64Data) (float64, error) {}
func Covariance(data1, data2 Float64Data) (float64, error) {}
func CovariancePopulation(data1, data2 Float64Data) (float64, error) {}
func CumulativeSum(input Float64Data) ([]float64, error) {}
func Describe(input Float64Data, allowNaN bool, percentiles *[]float64) (*Description, error) {}
func DescribePercentileFunc(input Float64Data, allowNaN bool, percentiles *[]float64, percentileFunc func(Float64Data, float64) (float64, error)) (*Description, error) {}
func Entropy(input Float64Data) (float64, error) {}
func EuclideanDistance(dataPointX, dataPointY Float64Data) (distance float64, err error) {}
func GeometricMean(input Float64Data) (float64, error) {}
Expand Down
81 changes: 81 additions & 0 deletions describe.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
package stats

import "fmt"

// Description holds the summary statistics that Describe and
// DescribePercentileFunc compute for a dataset.
type Description struct {
// Count is the number of elements in the input (input.Len()).
Count int
// Mean is the result of Mean(input).
Mean float64
// Std is the result of StandardDeviation(input).
Std float64
// Max is the result of Max(input).
Max float64
// Min is the result of Min(input).
Min float64
// DescriptionPercentiles has one entry per requested percentile that was
// kept, in the order the percentiles were requested.
DescriptionPercentiles []descriptionPercentile
// AllowedNaN records the allowNaN flag the description was built with.
AllowedNaN bool
}

// descriptionPercentile pairs a requested percentile with the value that was
// computed for it.
type descriptionPercentile struct {
// Percentile is the percentile that was requested, exactly as the caller
// passed it in.
Percentile float64
// Value is what the percentile function returned for Percentile.
Value float64
}

// Describe generates descriptive statistics about a provided dataset, similar
// to python's pandas.describe(). It is a convenience wrapper around
// DescribePercentileFunc that uses Percentile to compute every requested
// percentile.
func Describe(input Float64Data, allowNaN bool, percentiles *[]float64) (*Description, error) {
	description, err := DescribePercentileFunc(input, allowNaN, percentiles, Percentile)
	return description, err
}

// DescribePercentileFunc generates descriptive statistics about a provided
// dataset, similar to python's pandas.describe(), using percentileFunc to
// compute each requested percentile.
//
// Arguments:
//
//	input:          the dataset to describe.
//	allowNaN:       when false, an empty input is rejected with ErrEmptyInput
//	                and percentiles whose computation fails are left out.
//	                When true, nothing is rejected and whatever value a
//	                failing computation returned is stored as is.
//	percentiles:    percentiles to compute; nil means none.
//	percentileFunc: computes one percentile of input. A nil value falls back
//	                to Percentile, the function Describe uses.
//
// Return:
//
//	The Description and a nil error. For an empty input with allowNaN false
//	the error is ErrEmptyInput and the returned Description only has Count
//	and AllowedNaN set.
func DescribePercentileFunc(input Float64Data, allowNaN bool, percentiles *[]float64, percentileFunc func(Float64Data, float64) (float64, error)) (*Description, error) {
	// Calling a nil func value panics, so fall back to the package default
	// instead of crashing the caller on the first requested percentile.
	if percentileFunc == nil {
		percentileFunc = Percentile
	}

	var description Description
	description.AllowedNaN = allowNaN
	description.Count = input.Len()

	if description.Count == 0 && !allowNaN {
		return &description, ErrEmptyInput
	}

	// The errors are discarded on purpose: per the original author's note they
	// cannot occur once the empty-input case has been rejected above, and when
	// allowNaN is true the value returned alongside an error is accepted
	// (presumably NaN; confirm against the helpers).
	description.Std, _ = StandardDeviation(input)
	description.Max, _ = Max(input)
	description.Min, _ = Min(input)
	description.Mean, _ = Mean(input)

	if percentiles == nil {
		return &description, nil
	}

	for _, percentile := range *percentiles {
		value, err := percentileFunc(input, percentile)
		if err != nil && !allowNaN {
			// A failed percentile is only reported when NaN results are allowed.
			continue
		}
		description.DescriptionPercentiles = append(description.DescriptionPercentiles, descriptionPercentile{Percentile: percentile, Value: value})
	}

	return &description, nil
}

// String represents the Description instance in a string format, printing
// every floating-point value with the given number of decimals. Each line is
// a label and a value separated by a tab; percentile labels always use two
// decimals. The result has no trailing newline.
//
// A negative decimals is treated as 0, because fmt would otherwise write
// "%!(BADPREC)" into every floating-point line.
//
// Example output for decimals = 2:
//
//	count   3
//	mean    2.00
//	std     0.82
//	max     3.00
//	min     1.00
//	25.00%  NaN
//	50.00%  1.50
//	75.00%  2.50
//	NaN OK  true
func (d *Description) String(decimals int) string {
	// A negative '*' precision is a format error in fmt; clamp it.
	if decimals < 0 {
		decimals = 0
	}

	str := fmt.Sprintf("count\t%d\n", d.Count)
	str += fmt.Sprintf("mean\t%.*f\n", decimals, d.Mean)
	str += fmt.Sprintf("std\t%.*f\n", decimals, d.Std)
	str += fmt.Sprintf("max\t%.*f\n", decimals, d.Max)
	str += fmt.Sprintf("min\t%.*f\n", decimals, d.Min)
	for _, percentile := range d.DescriptionPercentiles {
		str += fmt.Sprintf("%.2f%%\t%.*f\n", percentile.Percentile, decimals, percentile.Value)
	}
	str += fmt.Sprintf("NaN OK\t%t", d.AllowedNaN)
	return str
}
83 changes: 83 additions & 0 deletions describe_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
package stats_test

import (
"math"
"testing"

"github.com/montanaflynn/stats"
)

// TestDescribeValidDataset checks that Describe returns no error for a
// non-empty dataset when NaN results are not allowed.
func TestDescribeValidDataset(t *testing.T) {
	data := []float64{1.0, 2.0, 3.0}
	requested := []float64{25.0, 50.0, 75.0}

	if _, err := stats.Describe(data, false, &requested); err != nil {
		t.Errorf("Returned an error")
	}
}

// TestDescribeEmptyDataset checks that Describe rejects an empty dataset with
// stats.ErrEmptyInput when NaN results are not allowed.
func TestDescribeEmptyDataset(t *testing.T) {
	empty := []float64{}

	if _, err := stats.Describe(empty, false, nil); err != stats.ErrEmptyInput {
		t.Errorf("Did not return empty input error")
	}
}

// TestDescribeEmptyDatasetNaN checks that, with NaN results allowed,
// describing an empty dataset returns no error and that Max, Mean, Min and
// Std are all NaN.
func TestDescribeEmptyDatasetNaN(t *testing.T) {
	summary, err := stats.Describe([]float64{}, true, nil)
	if err != nil {
		t.Errorf("Returned an error")
	}

	// Report a single failure as soon as any of the statistics is a real number.
	for _, stat := range []float64{summary.Max, summary.Mean, summary.Min, summary.Std} {
		if !math.IsNaN(stat) {
			t.Errorf("Was not NaN")
			break
		}
	}
}

// TestDescribeValidDatasetNaN checks that allowing NaN results does not turn
// the maximum of a non-empty dataset into NaN and that no error is returned.
func TestDescribeValidDatasetNaN(t *testing.T) {
	data := []float64{1.0, 2.0, 3.0}
	requested := []float64{25.0, 50.0, 75.0}

	summary, err := stats.Describe(data, true, &requested)
	if err != nil {
		t.Errorf("Returned an error")
	}

	if maxIsNaN := math.IsNaN(summary.Max); maxIsNaN {
		t.Errorf("Was NaN")
	}
}

// TestDescribeValues checks that the fields of the returned description match
// the dataset: Count equals its length, one percentile entry exists per
// requested percentile, and Max, Min, Mean and Std equal the results of the
// corresponding stats functions.
func TestDescribeValues(t *testing.T) {
	data := []float64{1.0, 2.0, 3.0}
	requested := []float64{25.0, 50.0, 75.0}
	summary, _ := stats.Describe(data, true, &requested)

	if got, want := summary.Count, len(data); got != want {
		t.Errorf("Count was not == length of dataset")
	}
	if got, want := len(summary.DescriptionPercentiles), len(requested); got != want {
		t.Errorf("Percentiles length was not == length of input percentiles")
	}

	if want, _ := stats.Max(data); want != summary.Max {
		t.Errorf("Max was not equal to Max(dataset)")
	}

	if want, _ := stats.Min(data); want != summary.Min {
		t.Errorf("Min was not equal to Min(dataset)")
	}

	if want, _ := stats.Mean(data); want != summary.Mean {
		t.Errorf("Mean was not equal to Mean(dataset)")
	}

	if want, _ := stats.StandardDeviation(data); want != summary.Std {
		t.Errorf("Std was not equal to StandardDeviation(dataset)")
	}
}

// TestDescribeString checks the exact two-decimal text rendering of the
// description of the dataset {1, 2, 3} with the 25th, 50th and 75th
// percentiles requested and NaN results allowed.
func TestDescribeString(t *testing.T) {
	const want = "count\t3\nmean\t2.00\nstd\t0.82\nmax\t3.00\nmin\t1.00\n25.00%\tNaN\n50.00%\t1.50\n75.00%\t2.50\nNaN OK\ttrue"

	data := []float64{1.0, 2.0, 3.0}
	requested := []float64{25.0, 50.0, 75.0}
	summary, _ := stats.Describe(data, true, &requested)

	if got := summary.String(2); got != want {
		t.Errorf("String output is not correct")
	}
}
21 changes: 12 additions & 9 deletions distances.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,16 +62,19 @@ func ManhattanDistance(dataPointX, dataPointY Float64Data) (distance float64, er
// MinkowskiDistance computes the Minkowski distance between two data sets
//
// Arguments:
// dataPointX: First set of data points
// dataPointY: Second set of data points. Length of both data
// sets must be equal.
// lambda: aka p or city blocks; with lambda = 1 the
// returned distance is the Manhattan distance, and with
// lambda = 2 it is the Euclidean distance. As lambda
// approaches infinity, the distance approaches the
// Chebyshev distance.
//
// dataPointX: First set of data points
// dataPointY: Second set of data points. Length of both data
// sets must be equal.
// lambda: aka p or city blocks; with lambda = 1 the
// returned distance is the Manhattan distance, and with
// lambda = 2 it is the Euclidean distance. As lambda
// approaches infinity, the distance approaches the
// Chebyshev distance.
//
// Return:
// Distance or error
//
// Distance or error
func MinkowskiDistance(dataPointX, dataPointY Float64Data, lambda float64) (distance float64, err error) {
err = validateData(dataPointX, dataPointY)
if err != nil {
Expand Down
9 changes: 6 additions & 3 deletions examples/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,10 +180,13 @@ func main() {
prob1 := 0.5
exp, _ := stats.ExpGeom(prob1)
fmt.Println(exp)
// Output:
prob2:= 0.5
// Output:

prob2 := 0.5
vari, _ := stats.VarGeom(prob2)
fmt.Println(vari)
// Output: 2

description, _ := stats.Describe([]float64{1.0, 2.0, 3.0}, true, &[]float64{25.0, 50.0, 75.0})
fmt.Println(description.String(2))
}

0 comments on commit b9dad85

Please sign in to comment.