Skip to content

Commit

Permalink
ARROW-3966 [Java] JDBC Column Metadata in Arrow Field Metadata
Browse files Browse the repository at this point in the history
https://issues.apache.org/jira/browse/ARROW-3966

This change includes #3133, and supports a new configuration item called "Include Metadata."  If true, metadata from the JDBC ResultSetMetaData object is pulled along to the Schema Field Metadata.  For now, this includes:
* Catalog Name
* Table Name
* Column Name
* Column Type Name

Author: Mike Pigott <mpigott@gmail.com>
Author: Michael Pigott <mikepigott@users.noreply.github.com>

Closes #3134 from mikepigott/jdbc-column-metadata and squashes the following commits:

02f2f34 <Mike Pigott> ARROW-3966: Picking up lost change to support null calendars.
7049c36 <Mike Pigott> Merge branch 'master' into jdbc-column-metadata
e9a9b2b <Michael Pigott> Merge pull request #6 from apache/master
65741a9 <Mike Pigott> ARROW-3966: Code review feedback
cc6cc88 <Mike Pigott> ARROW-3966: Using a 1:N loop instead of a 0:N-1 loop for fewer index offsets in code.
cfb2ba6 <Mike Pigott> ARROW-3966: Using a helper method for building a UTC calendar with root locale.
2928513 <Mike Pigott> ARROW-3966: Moving the metadata flag assignment into the builder.
69022c2 <Mike Pigott> ARROW-3966: Fixing merge.
4a6de86 <Mike Pigott> Merge branch 'master' into jdbc-column-metadata
509a1cc <Michael Pigott> Merge pull request #5 from apache/master
789c8c8 <Michael Pigott> Merge pull request #4 from apache/master
e5b19ee <Michael Pigott> Merge pull request #3 from apache/master
3b17c29 <Michael Pigott> Merge pull request #2 from apache/master
d847ebc <Mike Pigott> Fixing file location
1ceac9e <Mike Pigott> Merge branch 'master' into jdbc-column-metadata
881c6c8 <Michael Pigott> Merge pull request #1 from apache/master
03091a8 <Mike Pigott> Unit tests for including result set metadata.
72d64cc <Mike Pigott> Affirming the field metadata is empty when the configuration excludes field metadata.
7b4527c <Mike Pigott> Test for the include-metadata flag in the configuration.
7e9ce37 <Mike Pigott> Merge branch 'jdbc-to-arrow-config' into jdbc-column-metadata
bb3165b <Mike Pigott> Updating the function calls to use the JdbcToArrowConfig versions.
a6fb1be <Mike Pigott> Fixing function call
5bfd6a2 <Mike Pigott> Merge branch 'jdbc-to-arrow-config' into jdbc-column-metadata
68c91e7 <Mike Pigott> Modifying the jdbcToArrowSchema and jdbcToArrowVectors methods to receive JdbcToArrowConfig objects.
b5b0cb1 <Mike Pigott> Merge branch 'jdbc-to-arrow-config' into jdbc-column-metadata
8d6cf00 <Mike Pigott> Documentation for public static VectorSchemaRoot sqlToArrow(Connection connection, String query, JdbcToArrowConfig config)
4f1260c <Mike Pigott> Adding documentation for public static VectorSchemaRoot sqlToArrow(ResultSet resultSet, JdbcToArrowConfig config)
e34a9e7 <Mike Pigott> Fixing formatting.
fe097c8 <Mike Pigott> Merge branch 'jdbc-to-arrow-config' into jdbc-column-metadata
df632e3 <Mike Pigott> Updating the SQL tests to include JdbcToArrowConfig versions.
b270044 <Mike Pigott> Updated validation & documentation, and unit tests for the new JdbcToArrowConfig.
da77cbe <Mike Pigott> Creating a configuration class for the JDBC-to-Arrow converter.
a78c770 <Mike Pigott> Updating Javadocs.
523387f <Mike Pigott> Updating the API to support an optional 'includeMetadata' field.
5af1b5b <Mike Pigott> Separating out the field-type creation from the field creation.
  • Loading branch information
Mike Pigott authored and xhochy committed Feb 8, 2019
1 parent 5863a9f commit 1cf4cdd
Show file tree
Hide file tree
Showing 12 changed files with 295 additions and 42 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.arrow.adapter.jdbc;

/**
 * Keys under which JDBC column information is stored in Arrow Schema field metadata.
 *
 * <p>When the "include metadata" option of the JDBC-to-Arrow configuration is enabled,
 * the schema conversion stores values read from the JDBC <code>ResultSetMetaData</code>
 * under these keys. This class only holds constants and cannot be instantiated.
 */
public final class Constants {

  /** Metadata key for the column's catalog name (<code>ResultSetMetaData.getCatalogName</code>). */
  public static final String SQL_CATALOG_NAME_KEY = "SQL_CATALOG_NAME";

  /** Metadata key for the column's table name (<code>ResultSetMetaData.getTableName</code>). */
  public static final String SQL_TABLE_NAME_KEY = "SQL_TABLE_NAME";

  /** Metadata key for the column's name (<code>ResultSetMetaData.getColumnName</code>). */
  public static final String SQL_COLUMN_NAME_KEY = "SQL_COLUMN_NAME";

  /** Metadata key for the column's SQL type name (<code>ResultSetMetaData.getColumnTypeName</code>). */
  public static final String SQL_TYPE_KEY = "SQL_TYPE";

  private Constants() {
    // Constants holder only; never instantiated.
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Calendar;
import java.util.Locale;
import java.util.TimeZone;

import org.apache.arrow.memory.BaseAllocator;
import org.apache.arrow.memory.RootAllocator;
Expand Down Expand Up @@ -90,7 +88,7 @@ public static VectorSchemaRoot sqlToArrow(Connection connection, String query, B
Preconditions.checkNotNull(allocator, "Memory allocator object can not be null");

JdbcToArrowConfig config =
new JdbcToArrowConfig(allocator, Calendar.getInstance(TimeZone.getTimeZone("UTC"), Locale.ROOT));
new JdbcToArrowConfig(allocator, JdbcToArrowUtils.getUtcCalendar(), false);
return sqlToArrow(connection, query, config);
}

Expand All @@ -112,12 +110,13 @@ public static VectorSchemaRoot sqlToArrow(
String query,
BaseAllocator allocator,
Calendar calendar) throws SQLException, IOException {

Preconditions.checkNotNull(connection, "JDBC connection object can not be null");
Preconditions.checkArgument(query != null && query.length() > 0, "SQL query can not be null or empty");
Preconditions.checkNotNull(allocator, "Memory allocator object can not be null");
Preconditions.checkNotNull(calendar, "Calendar object can not be null");

return sqlToArrow(connection, query, new JdbcToArrowConfig(allocator, calendar));
return sqlToArrow(connection, query, new JdbcToArrowConfig(allocator, calendar, false));
}

/**
Expand Down Expand Up @@ -154,7 +153,7 @@ public static VectorSchemaRoot sqlToArrow(Connection connection, String query, J
public static VectorSchemaRoot sqlToArrow(ResultSet resultSet) throws SQLException, IOException {
Preconditions.checkNotNull(resultSet, "JDBC ResultSet object can not be null");

return sqlToArrow(resultSet, Calendar.getInstance(TimeZone.getTimeZone("UTC"), Locale.ROOT));
return sqlToArrow(resultSet, JdbcToArrowUtils.getUtcCalendar());
}

/**
Expand All @@ -171,7 +170,7 @@ public static VectorSchemaRoot sqlToArrow(ResultSet resultSet, BaseAllocator all
Preconditions.checkNotNull(allocator, "Memory Allocator object can not be null");

JdbcToArrowConfig config =
new JdbcToArrowConfig(allocator, Calendar.getInstance(TimeZone.getTimeZone("UTC"), Locale.ROOT));
new JdbcToArrowConfig(allocator, JdbcToArrowUtils.getUtcCalendar(), false);
return sqlToArrow(resultSet, config);
}

Expand All @@ -186,7 +185,7 @@ public static VectorSchemaRoot sqlToArrow(ResultSet resultSet, BaseAllocator all
public static VectorSchemaRoot sqlToArrow(ResultSet resultSet, Calendar calendar) throws SQLException, IOException {
Preconditions.checkNotNull(resultSet, "JDBC ResultSet object can not be null");

return sqlToArrow(resultSet, new JdbcToArrowConfig(new RootAllocator(Integer.MAX_VALUE), calendar));
return sqlToArrow(resultSet, new JdbcToArrowConfig(new RootAllocator(Integer.MAX_VALUE), calendar, false));
}

/**
Expand All @@ -198,12 +197,15 @@ public static VectorSchemaRoot sqlToArrow(ResultSet resultSet, Calendar calendar
* @return Arrow Data Objects {@link VectorSchemaRoot}
* @throws SQLException on error
*/
public static VectorSchemaRoot sqlToArrow(ResultSet resultSet, BaseAllocator allocator, Calendar calendar)
public static VectorSchemaRoot sqlToArrow(
ResultSet resultSet,
BaseAllocator allocator,
Calendar calendar)
throws SQLException, IOException {
Preconditions.checkNotNull(resultSet, "JDBC ResultSet object can not be null");
Preconditions.checkNotNull(allocator, "Memory Allocator object can not be null");

return sqlToArrow(resultSet, new JdbcToArrowConfig(allocator, calendar));
return sqlToArrow(resultSet, new JdbcToArrowConfig(allocator, calendar, false));
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,20 +37,23 @@
public final class JdbcToArrowConfig {
private Calendar calendar;
private BaseAllocator allocator;
private boolean includeMetadata;

/**
* Constructs a new configuration from the provided allocator and calendar. The <code>allocator</code>
* is used when constructing the Arrow vectors from the ResultSet, and the calendar is used to define
* Arrow Timestamp fields, and to read time-based fields from the JDBC <code>ResultSet</code>.
*
* @param allocator The memory allocator to construct the Arrow vectors with.
* @param calendar The calendar to use when constructing Timestamp fields and reading time-based results.
* @param allocator The memory allocator to construct the Arrow vectors with.
* @param calendar The calendar to use when constructing Timestamp fields and reading time-based results.
* @param includeMetadata Whether to include JDBC field metadata in the Arrow Schema Field metadata.
*/
JdbcToArrowConfig(BaseAllocator allocator, Calendar calendar) {
JdbcToArrowConfig(BaseAllocator allocator, Calendar calendar, boolean includeMetadata) {
Preconditions.checkNotNull(allocator, "Memory allocator cannot be null");

this.allocator = allocator;
this.calendar = calendar;
this.includeMetadata = includeMetadata;
}

/**
Expand All @@ -70,4 +73,13 @@ public Calendar getCalendar() {
public BaseAllocator getAllocator() {
return allocator;
}

/**
 * Reports whether JDBC ResultSet field metadata should be copied into the
 * Arrow Schema field metadata.
 *
 * @return <code>true</code> if field metadata is to be included, <code>false</code> if it is to be left out.
 */
public boolean shouldIncludeMetadata() {
  return this.includeMetadata;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
public class JdbcToArrowConfigBuilder {
private Calendar calendar;
private BaseAllocator allocator;
private boolean includeMetadata;

/**
* Default constructor for the <code>JdbcToArrowConfigBuilder</code>.
Expand All @@ -38,6 +39,7 @@ public class JdbcToArrowConfigBuilder {
public JdbcToArrowConfigBuilder() {
this.allocator = null;
this.calendar = null;
this.includeMetadata = false;
}

/**
Expand All @@ -62,6 +64,32 @@ public JdbcToArrowConfigBuilder(BaseAllocator allocator, Calendar calendar) {

this.allocator = allocator;
this.calendar = calendar;
this.includeMetadata = false;
}

/**
* Constructor for the <code>JdbcToArrowConfigBuilder</code>. Both the
* allocator and calendar are required. A {@link NullPointerException}
* will be thrown if either of those arguments is <code>null</code>.
* <p>
* The allocator is used to construct Arrow vectors from the JDBC ResultSet.
* The calendar is used to determine the time zone of {@link java.sql.Timestamp}
* fields and convert {@link java.sql.Date}, {@link java.sql.Time}, and
* {@link java.sql.Timestamp} fields to a single, common time zone when reading
* from the result set.
* </p>
* <p>
* The <code>includeMetadata</code> argument, if <code>true</code>, will cause
* various information about each database field to be added to the Vector
* Schema's field metadata.
* </p>
*
* @param allocator The Arrow Vector memory allocator.
* @param calendar The calendar to use when constructing timestamp fields.
* @param includeMetadata Whether to include JDBC field metadata in the Arrow Schema field metadata.
*/
public JdbcToArrowConfigBuilder(BaseAllocator allocator, Calendar calendar, boolean includeMetadata) {
// Delegate to the two-argument constructor for the allocator and calendar;
// it leaves includeMetadata at false, so the flag is applied afterwards.
this(allocator, calendar);
this.includeMetadata = includeMetadata;
}

/**
Expand All @@ -87,6 +115,17 @@ public JdbcToArrowConfigBuilder setCalendar(Calendar calendar) {
return this;
}

/**
* Sets whether to include JDBC ResultSet field metadata in the Arrow Schema field metadata.
*
* @param includeMetadata Whether to include or exclude JDBC metadata in the Arrow Schema field metadata.
* @return This instance of the <code>JdbcToArrowConfigBuilder</code>, for chaining.
*/
public JdbcToArrowConfigBuilder setIncludeMetadata(boolean includeMetadata) {
this.includeMetadata = includeMetadata;
return this;
}

/**
* This builds the {@link JdbcToArrowConfig} from the provided
* {@link BaseAllocator} and {@link Calendar}.
Expand All @@ -95,6 +134,6 @@ public JdbcToArrowConfigBuilder setCalendar(Calendar calendar) {
* @throws NullPointerException if the allocator was not set.
*/
public JdbcToArrowConfig build() {
return new JdbcToArrowConfig(allocator, calendar);
return new JdbcToArrowConfig(allocator, calendar, includeMetadata);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,11 @@
import java.sql.Types;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TimeZone;

import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.BaseFixedWidthVector;
Expand Down Expand Up @@ -103,7 +107,14 @@ public static Schema jdbcToArrowSchema(ResultSetMetaData rsmd, Calendar calendar
Preconditions.checkNotNull(rsmd, "JDBC ResultSetMetaData object can't be null");
Preconditions.checkNotNull(calendar, "Calendar object can't be null");

return jdbcToArrowSchema(rsmd, new JdbcToArrowConfig(new RootAllocator(0), calendar));
return jdbcToArrowSchema(rsmd, new JdbcToArrowConfig(new RootAllocator(0), calendar, false));
}

/**
 * Creates a {@link java.util.Calendar} that uses the UTC time zone and the root locale.
 *
 * @return the calendar obtained from <code>Calendar.getInstance</code> for UTC and {@link Locale#ROOT}.
 */
public static Calendar getUtcCalendar() {
  final TimeZone utc = TimeZone.getTimeZone("UTC");
  return Calendar.getInstance(utc, Locale.ROOT);
}

/**
Expand Down Expand Up @@ -145,78 +156,103 @@ public static Schema jdbcToArrowSchema(ResultSetMetaData rsmd, JdbcToArrowConfig
Preconditions.checkNotNull(rsmd, "JDBC ResultSetMetaData object can't be null");
Preconditions.checkNotNull(config, "The configuration object must not be null");

final String timezone;
if (config.getCalendar() != null) {
timezone = config.getCalendar().getTimeZone().getID();
} else {
timezone = null;
}

List<Field> fields = new ArrayList<>();
int columnCount = rsmd.getColumnCount();
for (int i = 1; i <= columnCount; i++) {
String columnName = rsmd.getColumnName(i);
final String columnName = rsmd.getColumnName(i);
final FieldType fieldType;

final Map<String, String> metadata;
if (config.shouldIncludeMetadata()) {
metadata = new HashMap<>();
metadata.put(Constants.SQL_CATALOG_NAME_KEY, rsmd.getCatalogName(i));
metadata.put(Constants.SQL_TABLE_NAME_KEY, rsmd.getTableName(i));
metadata.put(Constants.SQL_COLUMN_NAME_KEY, columnName);
metadata.put(Constants.SQL_TYPE_KEY, rsmd.getColumnTypeName(i));

} else {
metadata = null;
}

switch (rsmd.getColumnType(i)) {
case Types.BOOLEAN:
case Types.BIT:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.Bool()), null));
fieldType = new FieldType(true, new ArrowType.Bool(), null, metadata);
break;
case Types.TINYINT:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.Int(8, true)), null));
fieldType = new FieldType(true, new ArrowType.Int(8, true), null, metadata);
break;
case Types.SMALLINT:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.Int(16, true)), null));
fieldType = new FieldType(true, new ArrowType.Int(16, true), null, metadata);
break;
case Types.INTEGER:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.Int(32, true)), null));
fieldType = new FieldType(true, new ArrowType.Int(32, true), null, metadata);
break;
case Types.BIGINT:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.Int(64, true)), null));
fieldType = new FieldType(true, new ArrowType.Int(64, true), null, metadata);
break;
case Types.NUMERIC:
case Types.DECIMAL:
int precision = rsmd.getPrecision(i);
int scale = rsmd.getScale(i);
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.Decimal(precision, scale)), null));
fieldType = new FieldType(true, new ArrowType.Decimal(precision, scale), null, metadata);
break;
case Types.REAL:
case Types.FLOAT:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.FloatingPoint(SINGLE)), null));
fieldType = new FieldType(true, new ArrowType.FloatingPoint(SINGLE), null, metadata);
break;
case Types.DOUBLE:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.FloatingPoint(DOUBLE)), null));
fieldType = new FieldType(true, new ArrowType.FloatingPoint(DOUBLE), null, metadata);
break;
case Types.CHAR:
case Types.NCHAR:
case Types.VARCHAR:
case Types.NVARCHAR:
case Types.LONGVARCHAR:
case Types.LONGNVARCHAR:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.Utf8()), null));
case Types.CLOB:
fieldType = new FieldType(true, new ArrowType.Utf8(), null, metadata);
break;
case Types.DATE:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.Date(DateUnit.MILLISECOND)), null));
fieldType = new FieldType(true, new ArrowType.Date(DateUnit.MILLISECOND), null, metadata);
break;
case Types.TIME:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.Time(TimeUnit.MILLISECOND, 32)), null));
fieldType = new FieldType(true, new ArrowType.Time(TimeUnit.MILLISECOND, 32), null, metadata);
break;
case Types.TIMESTAMP:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.Timestamp(TimeUnit.MILLISECOND,
config.getCalendar().getTimeZone().getID())), null));
fieldType =
new FieldType(
true,
new ArrowType.Timestamp(TimeUnit.MILLISECOND, timezone),
null,
metadata);
break;
case Types.BINARY:
case Types.VARBINARY:
case Types.LONGVARBINARY:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.Binary()), null));
break;
case Types.ARRAY:
// TODO Need to handle this type
// fields.add(new Field("list", FieldType.nullable(new ArrowType.List()), null));
break;
case Types.CLOB:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.Utf8()), null));
break;
case Types.BLOB:
fields.add(new Field(columnName, FieldType.nullable(new ArrowType.Binary()), null));
fieldType = new FieldType(true, new ArrowType.Binary(), null, metadata);
break;

case Types.ARRAY:
// TODO Need to handle this type
// fields.add(new Field("list", FieldType.nullable(new ArrowType.List()), null));
default:
// no-op, shouldn't get here
fieldType = null;
break;
}

if (fieldType != null) {
fields.add(new Field(columnName, fieldType, null));
}
}

return new Schema(fields, null);
Expand Down Expand Up @@ -250,7 +286,7 @@ public static void jdbcToArrowVectors(ResultSet rs, VectorSchemaRoot root, Calen
Preconditions.checkNotNull(rs, "JDBC ResultSet object can't be null");
Preconditions.checkNotNull(root, "JDBC ResultSet object can't be null");

jdbcToArrowVectors(rs, root, new JdbcToArrowConfig(new RootAllocator(0), calendar));
jdbcToArrowVectors(rs, root, new JdbcToArrowConfig(new RootAllocator(0), calendar, false));
}

/**
Expand Down

0 comments on commit 1cf4cdd

Please sign in to comment.