Skip to content

Commit

Permalink
ARROW-23: Add a logical Column data structure
Browse files Browse the repository at this point in the history
I also added global const instances of common primitive types

Author: Wes McKinney <wesm@apache.org>

Closes #15 from wesm/ARROW-23 and squashes the following commits:

1835d33 [Wes McKinney] Don't use auto
988135c [Wes McKinney] Add Column chunk type validation function
8a2e40e [Wes McKinney] Remove unneeded operator()/shared_from_this experiment
de9ec70 [Wes McKinney] Aggregate null counts too
7049314 [Wes McKinney] cpplint
a565d26 [Wes McKinney] Add ChunkedArray / Column ctors, test passes
0648ed2 [Wes McKinney] Prototyping
  • Loading branch information
wesm committed Mar 4, 2016
1 parent 3b777c7 commit 9c2b954
Show file tree
Hide file tree
Showing 12 changed files with 347 additions and 15 deletions.
2 changes: 2 additions & 0 deletions cpp/CMakeLists.txt
Expand Up @@ -458,10 +458,12 @@ endif()

add_subdirectory(src/arrow)
add_subdirectory(src/arrow/util)
add_subdirectory(src/arrow/table)
add_subdirectory(src/arrow/types)

set(LINK_LIBS
arrow_util
arrow_table
arrow_types)

set(ARROW_SRCS
Expand Down
1 change: 0 additions & 1 deletion cpp/src/arrow/array.h
Expand Up @@ -81,7 +81,6 @@ class Array {
DISALLOW_COPY_AND_ASSIGN(Array);
};


typedef std::shared_ptr<Array> ArrayPtr;

} // namespace arrow
Expand Down
7 changes: 3 additions & 4 deletions cpp/src/arrow/schema-test.cc
Expand Up @@ -31,7 +31,7 @@ using std::vector;
namespace arrow {

TEST(TestField, Basics) {
shared_ptr<DataType> ftype = std::make_shared<Int32Type>();
shared_ptr<DataType> ftype = INT32;
shared_ptr<DataType> ftype_nn = std::make_shared<Int32Type>(false);
Field f0("f0", ftype);
Field f0_nn("f0", ftype_nn);
Expand All @@ -44,7 +44,7 @@ TEST(TestField, Basics) {
}

TEST(TestField, Equals) {
shared_ptr<DataType> ftype = std::make_shared<Int32Type>();
shared_ptr<DataType> ftype = INT32;
shared_ptr<DataType> ftype_nn = std::make_shared<Int32Type>(false);

Field f0("f0", ftype);
Expand All @@ -61,8 +61,7 @@ class TestSchema : public ::testing::Test {
};

TEST_F(TestSchema, Basics) {
auto f0 = std::make_shared<Field>("f0", std::make_shared<Int32Type>());

auto f0 = std::make_shared<Field>("f0", INT32);
auto f1 = std::make_shared<Field>("f1", std::make_shared<UInt8Type>(false));
auto f1_optional = std::make_shared<Field>("f1", std::make_shared<UInt8Type>());

Expand Down
39 changes: 39 additions & 0 deletions cpp/src/arrow/table/CMakeLists.txt
@@ -0,0 +1,39 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

#######################################
# arrow_table
#######################################

# Sources compiled into the arrow_table static library
set(TABLE_SRCS
  column.cc
)

# Libraries arrow_table links against (none at present)
set(TABLE_LIBS
)

add_library(arrow_table STATIC
  ${TABLE_SRCS}
)
target_link_libraries(arrow_table ${TABLE_LIBS})
SET_TARGET_PROPERTIES(arrow_table PROPERTIES LINKER_LANGUAGE CXX)

# Headers: top level
# The file list was previously empty, so column.h was never installed.
install(FILES
  column.h
  DESTINATION include/arrow/table)

ADD_ARROW_TEST(column-test)
93 changes: 93 additions & 0 deletions cpp/src/arrow/table/column-test.cc
@@ -0,0 +1,93 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <gtest/gtest.h>
#include <cstdint>
#include <memory>
#include <string>
#include <vector>

#include "arrow/field.h"
#include "arrow/schema.h"
#include "arrow/table/column.h"
#include "arrow/test-util.h"
#include "arrow/type.h"
#include "arrow/types/integer.h"
#include "arrow/util/bit-util.h"
#include "arrow/util/buffer.h"
#include "arrow/util/memory-pool.h"
#include "arrow/util/status.h"

using std::shared_ptr;
using std::vector;

namespace arrow {

// Test fixture for the Column tests. Holds the default memory pool and offers
// a helper that builds arrays of a requested length and null count.
class TestColumn : public ::testing::Test {
 public:
  void SetUp() {
    pool_ = GetDefaultMemoryPool();
  }

  // Build an array of ArrayType with `length` slots. The data and null
  // buffers are allocated from pool_ and resized to fit `length` values.
  // `null_count` is forwarded to the array constructor.
  template <typename ArrayType>
  std::shared_ptr<Array> MakeArray(int32_t length, int32_t null_count = 0) {
    auto data = std::make_shared<PoolBuffer>(pool_);
    auto nulls = std::make_shared<PoolBuffer>(pool_);
    data->Resize(length * sizeof(typename ArrayType::value_type));
    nulls->Resize(util::bytes_for_bits(length));
    // Forward the caller's null_count; a hard-coded 10 used to be passed
    // here, which silently ignored the parameter.
    return std::make_shared<ArrayType>(length, data, null_count, nulls);
  }

 protected:
  MemoryPool* pool_;

  std::shared_ptr<ChunkedArray> data_;
  std::unique_ptr<Column> column_;
};

// A column built from three Int32 chunks exposes the field's name and type,
// and reports length and null count summed over the chunks.
TEST_F(TestColumn, BasicAPI) {
  ArrayVector chunks = {
    MakeArray<Int32Array>(100),
    MakeArray<Int32Array>(100, 10),
    MakeArray<Int32Array>(100, 20)
  };

  std::shared_ptr<Field> c0_field = std::make_shared<Field>("c0", INT32);
  column_.reset(new Column(c0_field, chunks));

  ASSERT_EQ("c0", column_->name());
  ASSERT_TRUE(column_->type()->Equals(INT32));
  ASSERT_EQ(300, column_->length());
  ASSERT_EQ(30, column_->null_count());
  ASSERT_EQ(3, column_->data()->num_chunks());
}

// ValidateData succeeds while every chunk matches the field's type and
// reports Invalid once a chunk of a different type is added.
TEST_F(TestColumn, ChunksInhomogeneous) {
  ArrayVector chunks = {
    MakeArray<Int32Array>(100),
    MakeArray<Int32Array>(100, 10)
  };

  std::shared_ptr<Field> c0_field = std::make_shared<Field>("c0", INT32);

  // All chunks are Int32: validation passes
  column_.reset(new Column(c0_field, chunks));
  ASSERT_OK(column_->ValidateData());

  // Append an Int16 chunk: validation must now fail
  chunks.push_back(MakeArray<Int16Array>(100, 10));
  column_.reset(new Column(c0_field, chunks));
  ASSERT_RAISES(Invalid, column_->ValidateData());
}

} // namespace arrow
62 changes: 62 additions & 0 deletions cpp/src/arrow/table/column.cc
@@ -0,0 +1,62 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "arrow/table/column.h"

#include <memory>
#include <sstream>

#include "arrow/field.h"
#include "arrow/util/status.h"

namespace arrow {

// Construct from a list of chunks. The chunk list is copied, and the total
// length and total null count are summed over the chunks once, here.
ChunkedArray::ChunkedArray(const ArrayVector& chunks) :
    chunks_(chunks),
    length_(0),
    // null_count_ must start at zero: it is accumulated with += below and
    // was previously left uninitialized.
    null_count_(0) {
  for (const std::shared_ptr<Array>& chunk : chunks) {
    length_ += chunk->length();
    null_count_ += chunk->null_count();
  }
}

// Build a column from a field and a list of chunks. The chunks are wrapped in
// a newly allocated ChunkedArray, which is handed to the other constructor.
Column::Column(const std::shared_ptr<Field>& field, const ArrayVector& chunks) :
    Column(field, std::make_shared<ChunkedArray>(chunks)) {}

// Build a column from a field and an existing chunked array; both shared
// pointers are copied into the column.
Column::Column(
    const std::shared_ptr<Field>& field,
    const std::shared_ptr<ChunkedArray>& data)
    : field_(field), data_(data) {
}

// Compare every chunk's type against the field's type. Returns
// Status::Invalid describing the first mismatching chunk, or Status::OK if
// all chunks match.
Status Column::ValidateData() {
  const std::shared_ptr<DataType>& expected = this->type();
  const int chunk_count = data_->num_chunks();

  for (int idx = 0; idx < chunk_count; ++idx) {
    const std::shared_ptr<DataType>& actual = data_->chunk(idx)->type();
    if (expected->Equals(actual)) {
      continue;
    }

    // First mismatch: report its index and both type names
    std::stringstream msg;
    msg << "In chunk " << idx << " expected type ";
    msg << expected->ToString();
    msg << " but saw ";
    msg << actual->ToString();
    return Status::Invalid(msg.str());
  }

  return Status::OK();
}

} // namespace arrow
103 changes: 103 additions & 0 deletions cpp/src/arrow/table/column.h
@@ -0,0 +1,103 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#ifndef ARROW_TABLE_COLUMN_H
#define ARROW_TABLE_COLUMN_H

#include <memory>
#include <string>
#include <vector>

#include "arrow/array.h"
#include "arrow/field.h"

namespace arrow {

typedef std::vector<std::shared_ptr<Array> > ArrayVector;

// A data structure managing a list of primitive Arrow arrays logically as one
// large array. The total length and null count are stored as members rather
// than recomputed on each access.
class ChunkedArray {
 public:
  explicit ChunkedArray(const ArrayVector& chunks);

  // @returns: the total length of the chunked array; computed on construction
  int64_t length() const {
    return length_;
  }

  // @returns: the stored total null count
  int64_t null_count() const {
    return null_count_;
  }

  // @returns: the number of chunks
  int num_chunks() const {
    // size() is unsigned; make the narrowing conversion to int explicit
    return static_cast<int>(chunks_.size());
  }

  // @returns: the chunk at position i; i is not range-checked here
  const std::shared_ptr<Array>& chunk(int i) const {
    return chunks_[i];
  }

 protected:
  ArrayVector chunks_;
  // Zero by default so the totals are well-defined even when a constructor
  // only accumulates into them
  int64_t length_ = 0;
  int64_t null_count_ = 0;
};

// A column pairs a field (name and type metadata) with a chunked array that
// holds the data. ValidateData() checks that the chunks' types agree with
// the field's type.
class Column {
 public:
  Column(const std::shared_ptr<Field>& field, const ArrayVector& chunks);
  Column(
      const std::shared_ptr<Field>& field,
      const std::shared_ptr<ChunkedArray>& data);

  // @returns: the length reported by the underlying chunked array
  int64_t length() const { return data_->length(); }

  // @returns: the null count reported by the underlying chunked array
  int64_t null_count() const { return data_->null_count(); }

  // @returns: the name stored in the column's field
  const std::string& name() const { return field_->name; }

  // @returns: the type stored in the column's field
  const std::shared_ptr<DataType>& type() const { return field_->type; }

  // @returns: the chunked array holding the column's data
  const std::shared_ptr<ChunkedArray>& data() const { return data_; }

  // Check that the array data is consistent with the field's metadata
  Status ValidateData();

 protected:
  std::shared_ptr<Field> field_;
  std::shared_ptr<ChunkedArray> data_;
};

} // namespace arrow

#endif // ARROW_TABLE_COLUMN_H
12 changes: 12 additions & 0 deletions cpp/src/arrow/type.cc
Expand Up @@ -19,4 +19,16 @@

namespace arrow {

// Shared global instances of the common primitive types, each constructed
// with the type's default constructor arguments. Callers can use these
// instead of allocating a new type object with std::make_shared each time.
// NOTE(review): these are namespace-scope objects initialized at program
// start-up; static initializers in other translation units presumably should
// not depend on them -- confirm.
const std::shared_ptr<BooleanType> BOOL = std::make_shared<BooleanType>();
const std::shared_ptr<UInt8Type> UINT8 = std::make_shared<UInt8Type>();
const std::shared_ptr<UInt16Type> UINT16 = std::make_shared<UInt16Type>();
const std::shared_ptr<UInt32Type> UINT32 = std::make_shared<UInt32Type>();
const std::shared_ptr<UInt64Type> UINT64 = std::make_shared<UInt64Type>();
const std::shared_ptr<Int8Type> INT8 = std::make_shared<Int8Type>();
const std::shared_ptr<Int16Type> INT16 = std::make_shared<Int16Type>();
const std::shared_ptr<Int32Type> INT32 = std::make_shared<Int32Type>();
const std::shared_ptr<Int64Type> INT64 = std::make_shared<Int64Type>();
const std::shared_ptr<FloatType> FLOAT = std::make_shared<FloatType>();
const std::shared_ptr<DoubleType> DOUBLE = std::make_shared<DoubleType>();

} // namespace arrow

0 comments on commit 9c2b954

Please sign in to comment.