Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-23: Add a logical Column data structure #15

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions cpp/CMakeLists.txt
Expand Up @@ -458,10 +458,12 @@ endif()

# Process the CMakeLists.txt of each component directory.
add_subdirectory(src/arrow)
add_subdirectory(src/arrow/util)
add_subdirectory(src/arrow/table)
add_subdirectory(src/arrow/types)

# Component libraries gathered into LINK_LIBS (arrow_table is the static
# library defined in src/arrow/table).
set(LINK_LIBS
arrow_util
arrow_table
arrow_types)

set(ARROW_SRCS
Expand Down
1 change: 0 additions & 1 deletion cpp/src/arrow/array.h
Expand Up @@ -81,7 +81,6 @@ class Array {
DISALLOW_COPY_AND_ASSIGN(Array);
};


typedef std::shared_ptr<Array> ArrayPtr;

} // namespace arrow
Expand Down
7 changes: 3 additions & 4 deletions cpp/src/arrow/schema-test.cc
Expand Up @@ -31,7 +31,7 @@ using std::vector;
namespace arrow {

TEST(TestField, Basics) {
shared_ptr<DataType> ftype = std::make_shared<Int32Type>();
shared_ptr<DataType> ftype = INT32;
shared_ptr<DataType> ftype_nn = std::make_shared<Int32Type>(false);
Field f0("f0", ftype);
Field f0_nn("f0", ftype_nn);
Expand All @@ -44,7 +44,7 @@ TEST(TestField, Basics) {
}

TEST(TestField, Equals) {
shared_ptr<DataType> ftype = std::make_shared<Int32Type>();
shared_ptr<DataType> ftype = INT32;
shared_ptr<DataType> ftype_nn = std::make_shared<Int32Type>(false);

Field f0("f0", ftype);
Expand All @@ -61,8 +61,7 @@ class TestSchema : public ::testing::Test {
};

TEST_F(TestSchema, Basics) {
auto f0 = std::make_shared<Field>("f0", std::make_shared<Int32Type>());

auto f0 = std::make_shared<Field>("f0", INT32);
auto f1 = std::make_shared<Field>("f1", std::make_shared<UInt8Type>(false));
auto f1_optional = std::make_shared<Field>("f1", std::make_shared<UInt8Type>());

Expand Down
39 changes: 39 additions & 0 deletions cpp/src/arrow/table/CMakeLists.txt
@@ -0,0 +1,39 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

#######################################
# arrow_table
#######################################

# Sources compiled into the arrow_table static library.
set(TABLE_SRCS
  column.cc
)

# Libraries that arrow_table links against (currently none).
set(TABLE_LIBS
)

add_library(arrow_table STATIC
  ${TABLE_SRCS}
)
target_link_libraries(arrow_table ${TABLE_LIBS})
SET_TARGET_PROPERTIES(arrow_table PROPERTIES LINKER_LANGUAGE CXX)

# Headers: top level
# column.h is this library's public header; the rule previously listed no
# files at all, so the header was never installed.
install(FILES
  column.h
  DESTINATION include/arrow/table)

ADD_ARROW_TEST(column-test)
93 changes: 93 additions & 0 deletions cpp/src/arrow/table/column-test.cc
@@ -0,0 +1,93 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <gtest/gtest.h>
#include <cstdint>
#include <memory>
#include <string>
#include <vector>

#include "arrow/field.h"
#include "arrow/schema.h"
#include "arrow/table/column.h"
#include "arrow/test-util.h"
#include "arrow/type.h"
#include "arrow/types/integer.h"
#include "arrow/util/bit-util.h"
#include "arrow/util/buffer.h"
#include "arrow/util/memory-pool.h"
#include "arrow/util/status.h"

using std::shared_ptr;
using std::vector;

namespace arrow {

// Test fixture for Column: holds the default memory pool and provides a
// helper for building the arrays used as column chunks.
class TestColumn : public ::testing::Test {
 public:
  void SetUp() {
    pool_ = GetDefaultMemoryPool();
  }

  // Creates an ArrayType array with `length` slots and the requested
  // `null_count`. The value buffer is sized for `length` elements of
  // ArrayType::value_type and the `nulls` buffer for one bit per slot; both
  // are only resized here, nothing is written into them.
  template <typename ArrayType>
  std::shared_ptr<Array> MakeArray(int32_t length, int32_t null_count = 0) {
    auto data = std::make_shared<PoolBuffer>(pool_);
    auto nulls = std::make_shared<PoolBuffer>(pool_);
    data->Resize(length * sizeof(typename ArrayType::value_type));
    nulls->Resize(util::bytes_for_bits(length));
    // Forward the caller's null_count. It used to be hard-coded to 10, which
    // silently ignored the argument.
    return std::make_shared<ArrayType>(length, data, null_count, nulls);
  }

 protected:
  MemoryPool* pool_;

  std::shared_ptr<ChunkedArray> data_;
  std::unique_ptr<Column> column_;
};

// Checks the Column accessors (name, type, length, null count, chunk count)
// on a column built from three Int32 chunks of 100 slots each.
TEST_F(TestColumn, BasicAPI) {
  const ArrayVector chunks = {
      MakeArray<Int32Array>(100),
      MakeArray<Int32Array>(100, 10),
      MakeArray<Int32Array>(100, 20)};

  const auto int32_field = std::make_shared<Field>("c0", INT32);
  column_.reset(new Column(int32_field, chunks));

  // Metadata comes from the field.
  ASSERT_EQ("c0", column_->name());
  ASSERT_TRUE(column_->type()->Equals(INT32));

  // Totals are aggregated over all chunks.
  ASSERT_EQ(300, column_->length());
  ASSERT_EQ(30, column_->null_count());
  ASSERT_EQ(3, column_->data()->num_chunks());
}

// ValidateData() must succeed while every chunk matches the field's type and
// report Invalid once a chunk of a different type is present.
TEST_F(TestColumn, ChunksInhomogeneous) {
  const auto int32_field = std::make_shared<Field>("c0", INT32);

  // Only Int32 chunks: consistent with the INT32 field.
  ArrayVector chunks = {
      MakeArray<Int32Array>(100),
      MakeArray<Int32Array>(100, 10)};
  column_.reset(new Column(int32_field, chunks));
  ASSERT_OK(column_->ValidateData());

  // Appending an Int16 chunk makes the data inconsistent with the field.
  chunks.push_back(MakeArray<Int16Array>(100, 10));
  column_.reset(new Column(int32_field, chunks));
  ASSERT_RAISES(Invalid, column_->ValidateData());
}

} // namespace arrow
62 changes: 62 additions & 0 deletions cpp/src/arrow/table/column.cc
@@ -0,0 +1,62 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "arrow/table/column.h"

#include <memory>
#include <sstream>

#include "arrow/field.h"
#include "arrow/util/status.h"

namespace arrow {

// Builds a chunked array over `chunks` and caches the summed length and null
// count of all chunks at construction time.
ChunkedArray::ChunkedArray(const ArrayVector& chunks) :
    chunks_(chunks),
    length_(0),
    // null_count_ used to be accumulated with += without ever being set, so
    // its value was indeterminate; start both totals from zero.
    null_count_(0) {
  for (const std::shared_ptr<Array>& chunk : chunks_) {
    length_ += chunk->length();
    null_count_ += chunk->null_count();
  }
}

// Constructs a column from a field and a list of chunks; the chunks are
// wrapped in a newly allocated ChunkedArray.
Column::Column(const std::shared_ptr<Field>& field, const ArrayVector& chunks)
    : field_(field),
      data_(std::make_shared<ChunkedArray>(chunks)) {
}

// Constructs a column from a field and an existing chunked array, which is
// shared rather than copied.
Column::Column(const std::shared_ptr<Field>& field,
               const std::shared_ptr<ChunkedArray>& data)
    : field_(field),
      data_(data) {
}

// Checks each chunk's type against the field's type, in chunk order.
// Returns Status::Invalid describing the first mismatching chunk, or
// Status::OK() when every chunk matches.
Status Column::ValidateData() {
  const int chunk_count = data_->num_chunks();
  for (int chunk_index = 0; chunk_index < chunk_count; ++chunk_index) {
    const std::shared_ptr<DataType>& chunk_type =
        data_->chunk(chunk_index)->type();
    if (this->type()->Equals(chunk_type)) {
      continue;
    }

    // First mismatch: report which chunk and both type names.
    std::stringstream message;
    message << "In chunk " << chunk_index << " expected type ";
    message << this->type()->ToString();
    message << " but saw ";
    message << chunk_type->ToString();
    return Status::Invalid(message.str());
  }
  return Status::OK();
}

} // namespace arrow
103 changes: 103 additions & 0 deletions cpp/src/arrow/table/column.h
@@ -0,0 +1,103 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#ifndef ARROW_TABLE_COLUMN_H
#define ARROW_TABLE_COLUMN_H

#include <memory>
#include <string>
#include <vector>

#include "arrow/array.h"
#include "arrow/field.h"

namespace arrow {

// A list of arrays, used as the chunks of a ChunkedArray.
typedef std::vector<std::shared_ptr<Array> > ArrayVector;

// Presents a sequence of Arrow arrays ("chunks") logically as a single large
// array.
class ChunkedArray {
 public:
  explicit ChunkedArray(const ArrayVector& chunks);

  // @returns: the cached total length of the chunked array
  int64_t length() const { return length_; }

  // @returns: the cached total null count of the chunked array
  int64_t null_count() const { return null_count_; }

  // @returns: the number of chunks held
  int num_chunks() const { return static_cast<int>(chunks_.size()); }

  // @returns: the i-th chunk; the index is not bounds-checked
  const std::shared_ptr<Array>& chunk(int i) const { return chunks_[i]; }

 protected:
  ArrayVector chunks_;
  int64_t length_;
  int64_t null_count_;
};

// An immutable column: a field (name and type metadata) paired with a
// chunked logical data array. ValidateData() checks that the chunks all
// have the field's type.
class Column {
 public:
  Column(const std::shared_ptr<Field>& field, const ArrayVector& chunks);
  Column(const std::shared_ptr<Field>& field,
         const std::shared_ptr<ChunkedArray>& data);

  // @returns: the total length reported by the chunked data
  int64_t length() const { return data_->length(); }

  // @returns: the total null count reported by the chunked data
  int64_t null_count() const { return data_->null_count(); }

  // @returns: the name stored in the column's field
  const std::string& name() const { return field_->name; }

  // @returns: the type stored in the column's field
  const std::shared_ptr<DataType>& type() const { return field_->type; }

  // @returns: the chunked logical array holding the column's data
  const std::shared_ptr<ChunkedArray>& data() const { return data_; }

  // Checks that the column's array data is consistent with the field's
  // metadata.
  Status ValidateData();

 protected:
  std::shared_ptr<Field> field_;
  std::shared_ptr<ChunkedArray> data_;
};

} // namespace arrow

#endif // ARROW_TABLE_COLUMN_H
12 changes: 12 additions & 0 deletions cpp/src/arrow/type.cc
Expand Up @@ -19,4 +19,16 @@

namespace arrow {

// Shared, namespace-scope instances of the primitive data types. Each one is
// created with its type's default constructor arguments, so callers can pass
// e.g. INT32 instead of writing std::make_shared<Int32Type>() themselves.
// NOTE(review): these are dynamically initialized globals; code that runs
// during static initialization in another translation unit should presumably
// not rely on them — confirm.
const std::shared_ptr<BooleanType> BOOL = std::make_shared<BooleanType>();
// Unsigned integer types.
const std::shared_ptr<UInt8Type> UINT8 = std::make_shared<UInt8Type>();
const std::shared_ptr<UInt16Type> UINT16 = std::make_shared<UInt16Type>();
const std::shared_ptr<UInt32Type> UINT32 = std::make_shared<UInt32Type>();
const std::shared_ptr<UInt64Type> UINT64 = std::make_shared<UInt64Type>();
// Signed integer types.
const std::shared_ptr<Int8Type> INT8 = std::make_shared<Int8Type>();
const std::shared_ptr<Int16Type> INT16 = std::make_shared<Int16Type>();
const std::shared_ptr<Int32Type> INT32 = std::make_shared<Int32Type>();
const std::shared_ptr<Int64Type> INT64 = std::make_shared<Int64Type>();
// Floating-point types.
const std::shared_ptr<FloatType> FLOAT = std::make_shared<FloatType>();
const std::shared_ptr<DoubleType> DOUBLE = std::make_shared<DoubleType>();

} // namespace arrow