Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-33526: [R] Implement new function open_dataset_csv with signature more closely matching read_csv_arrow #33614

Merged
merged 36 commits into from
Jan 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
f7c63cf
Add open_csv_dataset function
thisisnic Jan 11, 2023
c7578ac
Remove read_csv_arrow parameters which are not implemented for datase…
thisisnic Jan 11, 2023
3e932c0
Move CSV*Options setup into function body instead of param setup
thisisnic Jan 11, 2023
e057b81
Fix typo
thisisnic Jan 11, 2023
cb2b365
Remove redundant assignment
thisisnic Jan 11, 2023
3cbf7a8
Fix typo
thisisnic Jan 11, 2023
cc16968
Fix typo
thisisnic Jan 11, 2023
df25d61
Add open_csv_dataset to pkgdown
thisisnic Jan 11, 2023
5aa4525
Add back in missing arguments
thisisnic Jan 11, 2023
aeb1dbf
Add tests
thisisnic Jan 11, 2023
1528b95
Hook up na to null_values
thisisnic Jan 11, 2023
63f223f
Chuck in unimplemented functions and errors
thisisnic Jan 11, 2023
fa50ef0
Add in quoted_na to signature
thisisnic Jan 11, 2023
72f2e7e
Test on NAs, run document()
thisisnic Jan 12, 2023
677ca8e
Shrink function signature and refactor opts to parse_options
thisisnic Jan 12, 2023
e86d84f
Refactor args checking code to pull out into functions
thisisnic Jan 12, 2023
d8ae2c9
Fix numerous typos
thisisnic Jan 12, 2023
cf95f94
Improve as_data_frame error message
thisisnic Jan 12, 2023
7f858ea
Improve col_select error
thisisnic Jan 12, 2023
983362e
Update docs
thisisnic Jan 12, 2023
40199c0
Appease linter
thisisnic Jan 12, 2023
761afd3
Remove unused params
thisisnic Jan 16, 2023
26f9f1b
Update docs for CsvFileFormat$create
thisisnic Jan 16, 2023
fb087a9
Add notes to open_csv_dataset about unsupported options
thisisnic Jan 16, 2023
fdc2658
Update docs and run devtools::document()
thisisnic Jan 16, 2023
059fe5e
Add delim param
thisisnic Jan 16, 2023
0468160
Add new functions to pkgdown
thisisnic Jan 16, 2023
7d6b1ce
Add CsvFileFormat to pkgdown
thisisnic Jan 16, 2023
c85808a
Add example to docs showing switching between read_csv_arrow and open…
thisisnic Jan 16, 2023
54b529d
Fix error where col_types wasn't being passed through
thisisnic Jan 16, 2023
785db46
Add tests for more params
thisisnic Jan 16, 2023
29da6a8
Add documentation to helper func
thisisnic Jan 17, 2023
b710eb8
Unlink at end
thisisnic Jan 17, 2023
3388076
Realign
thisisnic Jan 17, 2023
b3c27a0
Remove skip in windows and hope for the best
thisisnic Jan 17, 2023
e0e8411
Update tests and docs
thisisnic Jan 17, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions r/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,10 @@ export(new_extension_type)
export(null)
export(num_range)
export(one_of)
export(open_csv_dataset)
export(open_dataset)
export(open_delim_dataset)
export(open_tsv_dataset)
export(read_csv_arrow)
export(read_delim_arrow)
export(read_feather)
Expand Down
2 changes: 1 addition & 1 deletion r/R/csv.R
Original file line number Diff line number Diff line change
Expand Up @@ -500,7 +500,7 @@ CsvWriteOptions$create <- function(include_header = TRUE, batch_size = 1024L, nu
)
}

readr_to_csv_read_options <- function(skip = 0, col_names = TRUE, col_types = NULL) {
readr_to_csv_read_options <- function(skip = 0, col_names = TRUE) {
thisisnic marked this conversation as resolved.
Show resolved Hide resolved
if (isTRUE(col_names)) {
# C++ default to parse is 0-length string array
col_names <- character(0)
Expand Down
252 changes: 168 additions & 84 deletions r/R/dataset-format.R
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
#' It returns the appropriate subclass of `FileFormat` (e.g. `ParquetFileFormat`)
#' @rdname FileFormat
#' @name FileFormat
#' @examplesIf arrow_with_dataset() && tolower(Sys.info()[["sysname"]]) != "windows"
#' @examplesIf arrow_with_dataset()
#' ## Semi-colon delimited files
#' # Set up directory for examples
#' tf <- tempfile()
Expand Down Expand Up @@ -113,107 +113,105 @@ ParquetFileFormat$create <- function(...,
#' @export
IpcFileFormat <- R6Class("IpcFileFormat", inherit = FileFormat)

#' @usage NULL
#' @format NULL
#' @rdname FileFormat
#' CSV dataset file format
#'
#' @description
#' A `CsvFileFormat` is a [FileFormat] subclass which holds information about how to
#' read and parse the files included in a CSV `Dataset`.
#'
#' @section Factory:
#' `CsvFileFormat$create()` can take options in the form of lists passed through as `parse_options`,
#' `read_options`, or `convert_options` parameters. Alternatively, readr-style options can be passed
#' through individually. While it is possible to pass in `CsvReadOptions`, `CsvConvertOptions`, and `CsvParseOptions`
#' objects, this is not recommended, as options set in these objects are not validated for compatibility.
#'
#' @return A `CsvFileFormat` object
#' @rdname CsvFileFormat
#' @name CsvFileFormat
#' @seealso [FileFormat]
#' @examplesIf arrow_with_dataset()
#' # Set up directory for examples
#' tf <- tempfile()
#' dir.create(tf)
#' on.exit(unlink(tf))
#' df <- data.frame(x = c("1", "2", "NULL"))
#' write.table(df, file.path(tf, "file1.txt"), sep = ",", row.names = FALSE)
#'
#' # Create CsvFileFormat object with Arrow-style null_values option
#' format <- CsvFileFormat$create(convert_options = list(null_values = c("", "NA", "NULL")))
#' open_dataset(tf, format = format)
#'
#' # Use readr-style options
#' format <- CsvFileFormat$create(na = c("", "NA", "NULL"))
#' open_dataset(tf, format = format)
#'
#' @export
CsvFileFormat <- R6Class("CsvFileFormat", inherit = FileFormat)
CsvFileFormat$create <- function(...,
opts = csv_file_format_parse_options(...),
convert_options = csv_file_format_convert_opts(...),
read_options = csv_file_format_read_opts(...)) {
check_csv_file_format_args(...)
# Evaluate opts first to catch any unsupported arguments
force(opts)

options <- list(...)
schema <- options[["schema"]]
if (!is.null(schema) && !inherits(schema, "Schema")) {
abort(paste0(
"`schema` must be an object of class 'Schema' not '",
class(schema)[1],
"'."
))
}

if (!inherits(read_options, "CsvReadOptions")) {
read_options <- do.call(CsvReadOptions$create, read_options)
}
CsvFileFormat$create <- function(...) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

whereby:

  • AFAIK this isn't intended to be used much by users directly,
  • but it is exported, so at least some power users probably will anyway,
  • and I appreciate the flattening here,
  • but whenever more parameters get shoved through ... it gets harder for users to figure out what they can specify there (and specifically here you'll have to dig through more levels of source to figure it out),

suggested:

  • could we document where to look in ?FileFormat a bit more?
  • could that page specify that CsvFileFormat exists if this is going to @rdname there?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good points made; the refactoring I did here is because that bit of the code was overly complex, so in the name of usability, I think it'll make more sense to write actual independent docs for this function with more explicit information about its usage.

dots <- list(...)
options <- check_csv_file_format_args(dots)
check_schema(options[["schema"]], options[["read_options"]]$column_names)

if (!inherits(convert_options, "CsvConvertOptions")) {
convert_options <- do.call(CsvConvertOptions$create, convert_options)
}

if (!inherits(opts, "CsvParseOptions")) {
opts <- do.call(CsvParseOptions$create, opts)
}

column_names <- read_options$column_names
schema_names <- names(schema)
dataset___CsvFileFormat__Make(options$parse_options, options$convert_options, options$read_options)
}

if (!is.null(schema) && !identical(schema_names, column_names)) {
missing_from_schema <- setdiff(column_names, schema_names)
missing_from_colnames <- setdiff(schema_names, column_names)
message_colnames <- NULL
message_schema <- NULL
message_order <- NULL
# Check all arguments are valid
check_csv_file_format_args <- function(args) {
options <- list(
parse_options = args$parse_options,
convert_options = args$convert_options,
read_options = args$read_options,
schema = args$schema
)

if (length(missing_from_colnames) > 0) {
message_colnames <- paste(
oxford_paste(missing_from_colnames, quote_symbol = "`"),
"not present in `column_names`"
)
}
check_unsupported_args(args)
check_unrecognised_args(args)

if (length(missing_from_schema) > 0) {
message_schema <- paste(
oxford_paste(missing_from_schema, quote_symbol = "`"),
"not present in `schema`"
)
}
# Evaluate parse_options first to catch any unsupported arguments
if (is.null(args$parse_options)) {
options$parse_options <- do.call(csv_file_format_parse_opts, args)
} else if (is.list(args$parse_options)) {
options$parse_options <- do.call(CsvParseOptions$create, args$parse_options)
}

if (length(missing_from_schema) == 0 && length(missing_from_colnames) == 0) {
message_order <- "`column_names` and `schema` field names match but are not in the same order"
}
if (is.null(args$convert_options)) {
options$convert_options <- do.call(csv_file_format_convert_opts, args)
} else if (is.list(args$convert_options)) {
options$convert_options <- do.call(CsvConvertOptions$create, args$convert_options)
}

abort(
c(
"Values in `column_names` must match `schema` field names",
x = message_order,
x = message_schema,
x = message_colnames
)
)
if (is.null(args$read_options)) {
options$read_options <- do.call(csv_file_format_read_opts, args)
} else if (is.list(args$read_options)) {
options$read_options <- do.call(CsvReadOptions$create, args$read_options)
}

dataset___CsvFileFormat__Make(opts, convert_options, read_options)
options
}

# Check all arguments are valid
check_csv_file_format_args <- function(...) {
opts <- list(...)
check_unsupported_args <- function(args) {
opt_names <- get_opt_names(args)

# Filter out arguments meant for CsvConvertOptions/CsvReadOptions
convert_opts <- c(names(formals(CsvConvertOptions$create)))
supported_convert_opts <- c(names(formals(CsvConvertOptions$create)), "na")

read_opts <- c(
supported_read_opts <- c(
names(formals(CsvReadOptions$create)),
names(formals(readr_to_csv_read_options))
)

# We only currently support all of the readr options for parseoptions
parse_opts <- c(
supported_parse_opts <- c(
names(formals(CsvParseOptions$create)),
names(formals(readr_to_csv_parse_options))
)

opt_names <- names(opts)

# Catch any readr-style options specified with full option names that are
# supported by read_delim_arrow() (and its wrappers) but are not yet
# supported here
unsup_readr_opts <- setdiff(
names(formals(read_delim_arrow)),
c(convert_opts, read_opts, parse_opts, "schema")
c(supported_convert_opts, supported_read_opts, supported_parse_opts, "schema")
)

is_unsup_opt <- opt_names %in% unsup_readr_opts
Expand All @@ -228,9 +226,36 @@ check_csv_file_format_args <- function(...) {
call. = FALSE
)
}
}

# unlists "parse_options", "convert_options", "read_options" and returns them along with
# names of options passed in individually via args. `get_opt_names()` ignores any
# CSV*Options objects passed in as these are not validated - users must ensure they've
# chosen reasonable values in this case.
get_opt_names <- function(args) {
opt_names <- names(args)

# extract names of parse_options, read_options, and convert_options
if ("parse_options" %in% names(args) && is.list(args[["parse_options"]])) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happens if args[["parse_options"]] is not a list? (Probably should be in the comments since I can't tell by reading this what it should or should not be)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

args[["parse_options"]] can either be a list of options, or a CsvParseOptions object. This helper function is only designed to validate the former; the latter is more advanced/off-label usage and it's up to users to safely pass the correct arguments through here (you could argue that this shouldn't be the case, but doing it this way is consistent with how we've done this in other bits of the codebase).

I thought about documenting this, but I don't know where it would go - I don't think inside this function?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could redocument #' @param parse_options in open_delim_dataset()? (Or add to the existing #' @param parse_options if the behaviour is the same for both open_delim_dataset() and read_delim_arrow())

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it's off-label usage then you probably should error for it...with the new function we have the opportunity to protect users from off-label usage (which was very very easy to do with the existing open_dataset()). Passing parse_options = CsvParseOptions$create(...) was the first thing I thought to do and so I imagine somebody else might think to do it, too.

Copy link
Member Author

@thisisnic thisisnic Jan 17, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

parse_options = CsvParseOptions$create(...) is the way that users have been doing things thus far as passing in lists is new - this additional way of doing things only got merged in in the past few weeks. I'm not sure what the right call is here; I don't want to error on passing in a CsvParseOptions object and break folks' existing code but do want to encourage passing in options as a list so it actually hits our validation code.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see...let's consider that later, then, but make it clear in this function that those are the two options for future us trying to read this code.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added more info here, and more explicit acknowledgement of this to CsvFileFormat$create()

opt_names <- c(opt_names, names(args[["parse_options"]]))
}

if ("read_options" %in% names(args) && is.list(args[["read_options"]])) {
opt_names <- c(opt_names, names(args[["read_options"]]))
}

if ("convert_options" %in% names(args) && is.list(args[["convert_options"]])) {
opt_names <- c(opt_names, names(args[["convert_options"]]))
}

setdiff(opt_names, c("parse_options", "read_options", "convert_options"))
}

check_unrecognised_args <- function(opts) {
# Catch any options with full or partial names that do not match any of the
# recognized Arrow C++ option names or readr-style option names
opt_names <- get_opt_names(opts)

arrow_opts <- c(
names(formals(CsvParseOptions$create)),
names(formals(CsvReadOptions$create)),
Expand All @@ -240,7 +265,8 @@ check_csv_file_format_args <- function(...) {

readr_opts <- c(
names(formals(readr_to_csv_parse_options)),
names(formals(readr_to_csv_read_options))
names(formals(readr_to_csv_read_options)),
"na"
)

is_arrow_opt <- !is.na(pmatch(opt_names, arrow_opts))
Expand Down Expand Up @@ -271,26 +297,74 @@ check_ambiguous_options <- function(passed_opts, opts1, opts2) {
}
}

check_schema <- function(schema, column_names) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function is all existing code which I just extracted out into its own function

if (!is.null(schema) && !inherits(schema, "Schema")) {
abort(paste0(
"`schema` must be an object of class 'Schema' not '",
class(schema)[1],
"'."
))
}

schema_names <- names(schema)

if (!is.null(schema) && !identical(schema_names, column_names)) {
missing_from_schema <- setdiff(column_names, schema_names)
missing_from_colnames <- setdiff(schema_names, column_names)
message_colnames <- NULL
message_schema <- NULL
message_order <- NULL

if (length(missing_from_colnames) > 0) {
message_colnames <- paste(
oxford_paste(missing_from_colnames, quote_symbol = "`"),
"not present in `column_names`"
)
}

if (length(missing_from_schema) > 0) {
message_schema <- paste(
oxford_paste(missing_from_schema, quote_symbol = "`"),
"not present in `schema`"
)
}

if (length(missing_from_schema) == 0 && length(missing_from_colnames) == 0) {
message_order <- "`column_names` and `schema` field names match but are not in the same order"
}

abort(
c(
"Values in `column_names` must match `schema` field names",
x = message_order,
x = message_schema,
x = message_colnames
)
)
}
}

# Support both readr-style option names and Arrow C++ option names
csv_file_format_parse_options <- function(...) {
csv_file_format_parse_opts <- function(...) {
opts <- list(...)
# Filter out arguments meant for CsvConvertOptions/CsvReadOptions
convert_opts <- names(formals(CsvConvertOptions$create))
convert_opts <- c(names(formals(CsvConvertOptions$create)), "na", "convert_options")
read_opts <- c(
names(formals(CsvReadOptions$create)),
names(formals(readr_to_csv_read_options))
names(formals(readr_to_csv_read_options)),
"read_options"
)
opts[convert_opts] <- NULL
opts[read_opts] <- NULL
opts[["schema"]] <- NULL
opt_names <- names(opts)
opts[["parse_options"]] <- NULL
opt_names <- get_opt_names(opts)

arrow_opts <- c(names(formals(CsvParseOptions$create)))
readr_opts <- c(names(formals(readr_to_csv_parse_options)))

is_arrow_opt <- !is.na(pmatch(opt_names, arrow_opts))
is_readr_opt <- !is.na(pmatch(opt_names, readr_opts))

# Catch options with ambiguous partial names (such as "del") that make it
# unclear whether the user is specifying Arrow C++ options ("delimiter") or
# readr-style options ("delim")
Expand All @@ -313,28 +387,38 @@ csv_file_format_parse_options <- function(...) {
csv_file_format_convert_opts <- function(...) {
opts <- list(...)
# Filter out arguments meant for CsvParseOptions/CsvReadOptions
arrow_opts <- names(formals(CsvParseOptions$create))
arrow_opts <- c(names(formals(CsvParseOptions$create)), "parse_options")
readr_opts <- names(formals(readr_to_csv_parse_options))
read_opts <- c(
names(formals(CsvReadOptions$create)),
names(formals(readr_to_csv_read_options))
names(formals(readr_to_csv_read_options)),
"read_options"
)
opts[arrow_opts] <- NULL
opts[readr_opts] <- NULL
opts[read_opts] <- NULL
opts[["schema"]] <- NULL
opts[["convert_options"]] <- NULL

# map "na" to "null_values"
if ("na" %in% names(opts)) {
opts[["null_values"]] <- opts[["na"]]
opts[["na"]] <- NULL
}

do.call(CsvConvertOptions$create, opts)
}

csv_file_format_read_opts <- function(schema = NULL, ...) {
opts <- list(...)
# Filter out arguments meant for CsvParseOptions/CsvConvertOptions
arrow_opts <- names(formals(CsvParseOptions$create))
arrow_opts <- c(names(formals(CsvParseOptions$create)), "parse_options")
readr_opts <- names(formals(readr_to_csv_parse_options))
convert_opts <- names(formals(CsvConvertOptions$create))
convert_opts <- c(names(formals(CsvConvertOptions$create)), "na", "convert_options")
opts[arrow_opts] <- NULL
opts[readr_opts] <- NULL
opts[convert_opts] <- NULL
opts[["read_options"]] <- NULL

opt_names <- names(opts)
arrow_opts <- c(names(formals(CsvReadOptions$create)))
Expand Down