Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added describe feature #77

Merged
merged 2 commits into from
May 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ func Correlation(data1, data2 Float64Data) (float64, error) {}
func Covariance(data1, data2 Float64Data) (float64, error) {}
func CovariancePopulation(data1, data2 Float64Data) (float64, error) {}
func CumulativeSum(input Float64Data) ([]float64, error) {}
func Describe(input Float64Data, allowNaN bool, percentiles *[]float64) (*Description, error) {}
func DescribePercentileFunc(input Float64Data, allowNaN bool, percentiles *[]float64, percentileFunc func(Float64Data, float64) (float64, error)) (*Description, error) {}
func Entropy(input Float64Data) (float64, error) {}
func EuclideanDistance(dataPointX, dataPointY Float64Data) (distance float64, err error) {}
func GeometricMean(input Float64Data) (float64, error) {}
Expand Down
81 changes: 81 additions & 0 deletions describe.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
package stats

import "fmt"

// Description is the summary of a dataset produced by Describe and
// DescribePercentileFunc.
type Description struct {
	// Count is the number of values in the described dataset.
	Count int
	// Mean, Std, Max and Min hold the results of Mean, StandardDeviation,
	// Max and Min for the dataset.
	Mean, Std, Max, Min float64
	// DescriptionPercentiles holds one entry per computed percentile, in
	// the order the percentiles were requested.
	DescriptionPercentiles []descriptionPercentile
	// AllowedNaN records the allowNaN argument the description was built with.
	AllowedNaN bool
}

// descriptionPercentile pairs a requested percentile with the value that was
// computed for it.
type descriptionPercentile struct {
	Percentile, Value float64
}

// Describe generates descriptive statistics about a provided dataset, similar
// to python's pandas.describe(). It is DescribePercentileFunc with Percentile
// as the percentile calculation.
func Describe(input Float64Data, allowNaN bool, percentiles *[]float64) (*Description, error) {
	description, err := DescribePercentileFunc(input, allowNaN, percentiles, Percentile)
	return description, err
}

// DescribePercentileFunc generates descriptive statistics about a provided
// dataset, similar to python's pandas.describe(), using percentileFunc to
// compute each requested percentile.
//
// input is the dataset to describe. allowNaN controls how failures are
// treated: when false, an empty input yields ErrEmptyInput and any percentile
// whose calculation fails is left out of the result; when true, an empty
// input is accepted and every requested percentile is recorded with whatever
// value percentileFunc returned, even alongside an error. percentiles points
// to the percentiles to compute; nil computes none. percentileFunc performs
// the percentile calculation; nil selects Percentile.
//
// The returned Description is never nil. The only error returned is
// ErrEmptyInput.
func DescribePercentileFunc(input Float64Data, allowNaN bool, percentiles *[]float64, percentileFunc func(Float64Data, float64) (float64, error)) (*Description, error) {
	var description Description
	description.AllowedNaN = allowNaN
	description.Count = input.Len()

	if description.Count == 0 && !allowNaN {
		return &description, ErrEmptyInput
	}

	// Calling a nil function value panics; fall back to the same calculation
	// Describe uses instead.
	if percentileFunc == nil {
		percentileFunc = Percentile
	}

	// Errors are discarded on purpose: the empty-input case was rejected above
	// unless allowNaN is set, and in that case the value returned alongside
	// the error is stored as-is (the package tests expect NaN for an empty
	// dataset).
	description.Std, _ = StandardDeviation(input)
	description.Max, _ = Max(input)
	description.Min, _ = Min(input)
	description.Mean, _ = Mean(input)

	if percentiles == nil {
		return &description, nil
	}

	for _, percentile := range *percentiles {
		value, err := percentileFunc(input, percentile)
		if err != nil && !allowNaN {
			// Best effort: a percentile that cannot be computed is skipped
			// rather than failing the whole description.
			continue
		}
		description.DescriptionPercentiles = append(description.DescriptionPercentiles, descriptionPercentile{Percentile: percentile, Value: value})
	}

	return &description, nil
}

// String represents the Description instance in a string format with the
// specified number of decimals. Each line is a label and a value separated by
// a tab; percentile labels always use two decimals, and the final line has no
// trailing newline. A negative decimals is treated as 0.
//
// Example output for decimals = 2:
//
//	count   3
//	mean    2.00
//	std     0.82
//	max     3.00
//	min     1.00
//	25.00%  NaN
//	50.00%  1.50
//	75.00%  2.50
//	NaN OK  true
func (d *Description) String(decimals int) string {
	// fmt rejects a negative "%.*f" precision and writes "%!(BADPREC)" into
	// the output, so clamp it to the smallest valid precision.
	if decimals < 0 {
		decimals = 0
	}

	str := fmt.Sprintf("count\t%d\n", d.Count)
	str += fmt.Sprintf("mean\t%.*f\n", decimals, d.Mean)
	str += fmt.Sprintf("std\t%.*f\n", decimals, d.Std)
	str += fmt.Sprintf("max\t%.*f\n", decimals, d.Max)
	str += fmt.Sprintf("min\t%.*f\n", decimals, d.Min)
	for _, percentile := range d.DescriptionPercentiles {
		str += fmt.Sprintf("%.2f%%\t%.*f\n", percentile.Percentile, decimals, percentile.Value)
	}
	str += fmt.Sprintf("NaN OK\t%t", d.AllowedNaN)
	return str
}
83 changes: 83 additions & 0 deletions describe_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
package stats_test

import (
"math"
"testing"

"github.com/montanaflynn/stats"
)

// TestDescribeValidDataset checks that describing a populated dataset with
// allowNaN disabled does not return an error.
func TestDescribeValidDataset(t *testing.T) {
	dataSet := []float64{1.0, 2.0, 3.0}
	percentiles := []float64{25.0, 50.0, 75.0}

	if _, err := stats.Describe(dataSet, false, &percentiles); err != nil {
		t.Error("Returned an error")
	}
}

// TestDescribeEmptyDataset checks that an empty dataset is rejected with
// ErrEmptyInput when allowNaN is disabled.
func TestDescribeEmptyDataset(t *testing.T) {
	_, err := stats.Describe([]float64{}, false, nil)
	if err == stats.ErrEmptyInput {
		return
	}
	t.Error("Did not return empty input error")
}

// TestDescribeEmptyDatasetNaN checks that an empty dataset is accepted when
// allowNaN is enabled and that Max, Mean, Min and Std are all reported as NaN.
func TestDescribeEmptyDatasetNaN(t *testing.T) {
	describe, err := stats.Describe([]float64{}, true, nil)
	if err != nil {
		t.Error("Returned an error")
	}

	// Report at most once, as soon as any of the statistics is not NaN.
	for _, stat := range []float64{describe.Max, describe.Mean, describe.Min, describe.Std} {
		if !math.IsNaN(stat) {
			t.Error("Was not NaN")
			break
		}
	}
}

// TestDescribeValidDatasetNaN checks that enabling allowNaN on a populated
// dataset neither returns an error nor turns Max into NaN.
func TestDescribeValidDatasetNaN(t *testing.T) {
	dataSet := []float64{1.0, 2.0, 3.0}
	percentiles := []float64{25.0, 50.0, 75.0}

	describe, err := stats.Describe(dataSet, true, &percentiles)
	if err != nil {
		t.Error("Returned an error")
	}

	if isNaN := math.IsNaN(describe.Max); isNaN {
		t.Error("Was NaN")
	}
}

// TestDescribeValues checks that the fields of a Description agree with the
// standalone functions of the package for the same dataset.
func TestDescribeValues(t *testing.T) {
	dataSet := []float64{1.0, 2.0, 3.0}
	percentiles := []float64{25.0, 50.0, 75.0}
	describe, _ := stats.Describe(dataSet, true, &percentiles)

	if describe.Count != len(dataSet) {
		t.Error("Count was not == length of dataset")
	}
	if len(describe.DescriptionPercentiles) != len(percentiles) {
		t.Error("Percentiles length was not == length of input percentiles")
	}

	wantMax, _ := stats.Max(dataSet)
	wantMin, _ := stats.Min(dataSet)
	wantMean, _ := stats.Mean(dataSet)
	wantStd, _ := stats.StandardDeviation(dataSet)

	// Each statistic is compared exactly against its standalone counterpart.
	checks := []struct {
		got, want float64
		message   string
	}{
		{describe.Max, wantMax, "Max was not equal to Max(dataset)"},
		{describe.Min, wantMin, "Min was not equal to Min(dataset)"},
		{describe.Mean, wantMean, "Mean was not equal to Mean(dataset)"},
		{describe.Std, wantStd, "Std was not equal to StandardDeviation(dataset)"},
	}
	for _, check := range checks {
		if check.want != check.got {
			t.Error(check.message)
		}
	}
}

// TestDescribeString checks the exact text produced by Description.String
// with two decimals, including a percentile that is reported as NaN.
func TestDescribeString(t *testing.T) {
	const want = "count\t3\nmean\t2.00\nstd\t0.82\nmax\t3.00\nmin\t1.00\n25.00%\tNaN\n50.00%\t1.50\n75.00%\t2.50\nNaN OK\ttrue"

	describe, err := stats.Describe([]float64{1.0, 2.0, 3.0}, true, &[]float64{25.0, 50.0, 75.0})
	if err != nil {
		t.Fatalf("Describe returned an error: %v", err)
	}

	// Show both strings on failure so a mismatch can be diagnosed from the log.
	if got := describe.String(2); got != want {
		t.Errorf("String output is not correct:\n got %q\nwant %q", got, want)
	}
}
21 changes: 12 additions & 9 deletions distances.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,16 +62,19 @@ func ManhattanDistance(dataPointX, dataPointY Float64Data) (distance float64, er
// MinkowskiDistance computes the Minkowski distance between two data sets
//
// Arguments:
// dataPointX: First set of data points
// dataPointY: Second set of data points. Length of both data
// sets must be equal.
// lambda: aka p or city blocks; With lambda = 1
// returned distance is manhattan distance and
// lambda = 2; it is euclidean distance. Lambda
// reaching to infinite - distance would be chebysev
// distance.
//
// dataPointX: First set of data points
// dataPointY: Second set of data points. Length of both data
// sets must be equal.
// lambda: aka p or city blocks; with lambda = 1 the
// returned distance is the Manhattan distance and with
// lambda = 2 it is the Euclidean distance. As lambda
// approaches infinity, the distance approaches the
// Chebyshev distance.
//
// Return:
// Distance or error
//
// Distance or error
func MinkowskiDistance(dataPointX, dataPointY Float64Data, lambda float64) (distance float64, err error) {
err = validateData(dataPointX, dataPointY)
if err != nil {
Expand Down
9 changes: 6 additions & 3 deletions examples/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,10 +180,13 @@ func main() {
prob1 := 0.5
exp, _ := stats.ExpGeom(prob1)
fmt.Println(exp)
// Output:
prob2:= 0.5
// Output:

prob2 := 0.5
vari, _ := stats.VarGeom(prob2)
fmt.Println(vari)
// Output: 2

description, _ := stats.Describe([]float64{1.0, 2.0, 3.0}, true, &[]float64{25.0, 50.0, 75.0})
fmt.Println(description.String(2))
}