diff --git a/r/NAMESPACE b/r/NAMESPACE
index cde81d977b79c..3df107a2d8fb6 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -348,7 +348,10 @@ export(new_extension_type)
 export(null)
 export(num_range)
 export(one_of)
+export(open_csv_dataset)
 export(open_dataset)
+export(open_delim_dataset)
+export(open_tsv_dataset)
 export(read_csv_arrow)
 export(read_delim_arrow)
 export(read_feather)
diff --git a/r/R/csv.R b/r/R/csv.R
index 08f30fdefdf4c..135394b967730 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -500,7 +500,7 @@ CsvWriteOptions$create <- function(include_header = TRUE, batch_size = 1024L, nu
   )
 }
 
-readr_to_csv_read_options <- function(skip = 0, col_names = TRUE, col_types = NULL) {
+readr_to_csv_read_options <- function(skip = 0, col_names = TRUE) {
   if (isTRUE(col_names)) {
     # C++ default to parse is 0-length string array
     col_names <- character(0)
diff --git a/r/R/dataset-format.R b/r/R/dataset-format.R
index 7b88d9b8e0f7e..0912941e648a4 100644
--- a/r/R/dataset-format.R
+++ b/r/R/dataset-format.R
@@ -53,7 +53,7 @@
 #' It returns the appropriate subclass of `FileFormat` (e.g. `ParquetFileFormat`)
 #' @rdname FileFormat
 #' @name FileFormat
-#' @examplesIf arrow_with_dataset() && tolower(Sys.info()[["sysname"]]) != "windows"
+#' @examplesIf arrow_with_dataset()
 #' ## Semi-colon delimited files
 #' # Set up directory for examples
 #' tf <- tempfile()
@@ -113,107 +113,105 @@ ParquetFileFormat$create <- function(...,
 #' @export
 IpcFileFormat <- R6Class("IpcFileFormat", inherit = FileFormat)
 
-#' @usage NULL
-#' @format NULL
-#' @rdname FileFormat
+#' CSV dataset file format
+#'
+#' @description
+#' A `CsvFileFormat` is a [FileFormat] subclass which holds information about how to
+#' read and parse the files included in a CSV `Dataset`.
+#'
+#' @section Factory:
+#' `CsvFileFormat$create()` can take options in the form of lists passed through as `parse_options`,
+#' `read_options`, or `convert_options` parameters. Alternatively, readr-style options can be passed
+#' through individually. While it is possible to pass in `CsvReadOptions`, `CsvConvertOptions`, and `CsvParseOptions`
+#' objects, this is not recommended as options set in these objects are not validated for compatibility.
+#'
+#' @return A `CsvFileFormat` object
+#' @rdname CsvFileFormat
+#' @name CsvFileFormat
+#' @seealso [FileFormat]
+#' @examplesIf arrow_with_dataset()
+#' # Set up directory for examples
+#' tf <- tempfile()
+#' dir.create(tf)
+#' on.exit(unlink(tf))
+#' df <- data.frame(x = c("1", "2", "NULL"))
+#' write.table(df, file.path(tf, "file1.txt"), sep = ",", row.names = FALSE)
+#'
+#' # Create CsvFileFormat object with Arrow-style null_values option
+#' format <- CsvFileFormat$create(convert_options = list(null_values = c("", "NA", "NULL")))
+#' open_dataset(tf, format = format)
+#'
+#' # Use readr-style options
+#' format <- CsvFileFormat$create(na = c("", "NA", "NULL"))
+#' open_dataset(tf, format = format)
+#'
 #' @export
 CsvFileFormat <- R6Class("CsvFileFormat", inherit = FileFormat)
-CsvFileFormat$create <- function(...,
-                                 opts = csv_file_format_parse_options(...),
-                                 convert_options = csv_file_format_convert_opts(...),
-                                 read_options = csv_file_format_read_opts(...)) {
-  check_csv_file_format_args(...)
-  # Evaluate opts first to catch any unsupported arguments
-  force(opts)
-
-  options <- list(...)
-  schema <- options[["schema"]]
-  if (!is.null(schema) && !inherits(schema, "Schema")) {
-    abort(paste0(
-      "`schema` must be an object of class 'Schema' not '",
-      class(schema)[1],
-      "'."
- )) - } - - if (!inherits(read_options, "CsvReadOptions")) { - read_options <- do.call(CsvReadOptions$create, read_options) - } +CsvFileFormat$create <- function(...) { + dots <- list(...) + options <- check_csv_file_format_args(dots) + check_schema(options[["schema"]], options[["read_options"]]$column_names) - if (!inherits(convert_options, "CsvConvertOptions")) { - convert_options <- do.call(CsvConvertOptions$create, convert_options) - } - - if (!inherits(opts, "CsvParseOptions")) { - opts <- do.call(CsvParseOptions$create, opts) - } - - column_names <- read_options$column_names - schema_names <- names(schema) + dataset___CsvFileFormat__Make(options$parse_options, options$convert_options, options$read_options) +} - if (!is.null(schema) && !identical(schema_names, column_names)) { - missing_from_schema <- setdiff(column_names, schema_names) - missing_from_colnames <- setdiff(schema_names, column_names) - message_colnames <- NULL - message_schema <- NULL - message_order <- NULL +# Check all arguments are valid +check_csv_file_format_args <- function(args) { + options <- list( + parse_options = args$parse_options, + convert_options = args$convert_options, + read_options = args$read_options, + schema = args$schema + ) - if (length(missing_from_colnames) > 0) { - message_colnames <- paste( - oxford_paste(missing_from_colnames, quote_symbol = "`"), - "not present in `column_names`" - ) - } + check_unsupported_args(args) + check_unrecognised_args(args) - if (length(missing_from_schema) > 0) { - message_schema <- paste( - oxford_paste(missing_from_schema, quote_symbol = "`"), - "not present in `schema`" - ) - } + # Evaluate parse_options first to catch any unsupported arguments + if (is.null(args$parse_options)) { + options$parse_options <- do.call(csv_file_format_parse_opts, args) + } else if (is.list(args$parse_options)) { + options$parse_options <- do.call(CsvParseOptions$create, args$parse_options) + } - if (length(missing_from_schema) == 0 && length(missing_from_colnames) == 0) { - message_order <- "`column_names` and `schema` field names match but are not in the same order" - } + if (is.null(args$convert_options)) { + options$convert_options <- do.call(csv_file_format_convert_opts, args) + } else if (is.list(args$convert_options)) { + options$convert_options <- do.call(CsvConvertOptions$create, args$convert_options) + } - abort( - c( - "Values in `column_names` must match `schema` field names", - x = message_order, - x = message_schema, - x = message_colnames - ) - ) + if (is.null(args$read_options)) { + options$read_options <- do.call(csv_file_format_read_opts, args) + } else if (is.list(args$read_options)) { + options$read_options <- do.call(CsvReadOptions$create, args$read_options) } - dataset___CsvFileFormat__Make(opts, convert_options, read_options) + options } -# Check all arguments are valid -check_csv_file_format_args <- function(...) { - opts <- list(...) 
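+# check_unsupported_args() (below) rejects options which read_delim_arrow() accepts
+# but which the dataset CSV reader does not yet understand. A hypothetical session
+# illustrating the failure mode (message text approximate):
+#
+#   CsvFileFormat$create(quoted_na = TRUE)
+#   #> Error: The following option is supported in "read_delim_arrow" functions
+#   #> but not yet supported here: "quoted_na"
+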
+check_unsupported_args <- function(args) {
+  opt_names <- get_opt_names(args)
+
   # Filter out arguments meant for CsvConvertOptions/CsvReadOptions
-  convert_opts <- c(names(formals(CsvConvertOptions$create)))
+  supported_convert_opts <- c(names(formals(CsvConvertOptions$create)), "na")
 
-  read_opts <- c(
+  supported_read_opts <- c(
     names(formals(CsvReadOptions$create)),
     names(formals(readr_to_csv_read_options))
   )
 
   # We currently support the full set of readr options only for parse options
-  parse_opts <- c(
+  supported_parse_opts <- c(
     names(formals(CsvParseOptions$create)),
     names(formals(readr_to_csv_parse_options))
   )
 
-  opt_names <- names(opts)
-
   # Catch any readr-style options specified with full option names that are
   # supported by read_delim_arrow() (and its wrappers) but are not yet
   # supported here
   unsup_readr_opts <- setdiff(
     names(formals(read_delim_arrow)),
-    c(convert_opts, read_opts, parse_opts, "schema")
+    c(supported_convert_opts, supported_read_opts, supported_parse_opts, "schema")
   )
 
   is_unsup_opt <- opt_names %in% unsup_readr_opts
@@ -228,9 +226,36 @@ check_csv_file_format_args <- function(...) {
     call. = FALSE
   )
 }
+}
+
+# Returns the names of all options passed in, whether supplied individually or
+# inside the "parse_options", "convert_options", or "read_options" lists.
+# `get_opt_names()` ignores any CSV*Options objects passed in, as the contents of
+# these objects are not validated - users must ensure they've chosen reasonable
+# values in that case.
+get_opt_names <- function(args) {
+  opt_names <- names(args)
+
+  # extract names of options passed in via the parse_options, read_options, and
+  # convert_options lists
+  if ("parse_options" %in% names(args) && is.list(args[["parse_options"]])) {
+    opt_names <- c(opt_names, names(args[["parse_options"]]))
+  }
+
+  if ("read_options" %in% names(args) && is.list(args[["read_options"]])) {
+    opt_names <- c(opt_names, names(args[["read_options"]]))
+  }
+
+  if ("convert_options" %in% names(args) && is.list(args[["convert_options"]])) {
+    opt_names <- c(opt_names, names(args[["convert_options"]]))
+  }
+
+  setdiff(opt_names, c("parse_options", "read_options", "convert_options"))
+}
+
+check_unrecognised_args <- function(opts) {
   # Catch any options with full or partial names that do not match any of the
   # recognized Arrow C++ option names or readr-style option names
+  opt_names <- get_opt_names(opts)
+
   arrow_opts <- c(
     names(formals(CsvParseOptions$create)),
     names(formals(CsvReadOptions$create)),
@@ -240,7 +265,8 @@ check_csv_file_format_args <- function(...) {
 
   readr_opts <- c(
     names(formals(readr_to_csv_parse_options)),
-    names(formals(readr_to_csv_read_options))
+    names(formals(readr_to_csv_read_options)),
+    "na"
   )
 
   is_arrow_opt <- !is.na(pmatch(opt_names, arrow_opts))
@@ -271,26 +297,74 @@ check_ambiguous_options <- function(passed_opts, opts1, opts2) {
   }
 }
 
+check_schema <- function(schema, column_names) {
+  if (!is.null(schema) && !inherits(schema, "Schema")) {
+    abort(paste0(
+      "`schema` must be an object of class 'Schema' not '",
+      class(schema)[1],
+      "'."
+ )) + } + + schema_names <- names(schema) + + if (!is.null(schema) && !identical(schema_names, column_names)) { + missing_from_schema <- setdiff(column_names, schema_names) + missing_from_colnames <- setdiff(schema_names, column_names) + message_colnames <- NULL + message_schema <- NULL + message_order <- NULL + + if (length(missing_from_colnames) > 0) { + message_colnames <- paste( + oxford_paste(missing_from_colnames, quote_symbol = "`"), + "not present in `column_names`" + ) + } + + if (length(missing_from_schema) > 0) { + message_schema <- paste( + oxford_paste(missing_from_schema, quote_symbol = "`"), + "not present in `schema`" + ) + } + + if (length(missing_from_schema) == 0 && length(missing_from_colnames) == 0) { + message_order <- "`column_names` and `schema` field names match but are not in the same order" + } + + abort( + c( + "Values in `column_names` must match `schema` field names", + x = message_order, + x = message_schema, + x = message_colnames + ) + ) + } +} + # Support both readr-style option names and Arrow C++ option names -csv_file_format_parse_options <- function(...) { +csv_file_format_parse_opts <- function(...) { opts <- list(...) # Filter out arguments meant for CsvConvertOptions/CsvReadOptions - convert_opts <- names(formals(CsvConvertOptions$create)) + convert_opts <- c(names(formals(CsvConvertOptions$create)), "na", "convert_options") read_opts <- c( names(formals(CsvReadOptions$create)), - names(formals(readr_to_csv_read_options)) + names(formals(readr_to_csv_read_options)), + "read_options" ) opts[convert_opts] <- NULL opts[read_opts] <- NULL opts[["schema"]] <- NULL - opt_names <- names(opts) + opts[["parse_options"]] <- NULL + opt_names <- get_opt_names(opts) arrow_opts <- c(names(formals(CsvParseOptions$create))) readr_opts <- c(names(formals(readr_to_csv_parse_options))) is_arrow_opt <- !is.na(pmatch(opt_names, arrow_opts)) is_readr_opt <- !is.na(pmatch(opt_names, readr_opts)) - # Catch options with ambiguous partial names (such as "del") that make it # unclear whether the user is specifying Arrow C++ options ("delimiter") or # readr-style options ("delim") @@ -313,28 +387,38 @@ csv_file_format_parse_options <- function(...) { csv_file_format_convert_opts <- function(...) { opts <- list(...) # Filter out arguments meant for CsvParseOptions/CsvReadOptions - arrow_opts <- names(formals(CsvParseOptions$create)) + arrow_opts <- c(names(formals(CsvParseOptions$create)), "parse_options") readr_opts <- names(formals(readr_to_csv_parse_options)) read_opts <- c( names(formals(CsvReadOptions$create)), - names(formals(readr_to_csv_read_options)) + names(formals(readr_to_csv_read_options)), + "read_options" ) opts[arrow_opts] <- NULL opts[readr_opts] <- NULL opts[read_opts] <- NULL opts[["schema"]] <- NULL + opts[["convert_options"]] <- NULL + + # map "na" to "null_values" + if ("na" %in% names(opts)) { + opts[["null_values"]] <- opts[["na"]] + opts[["na"]] <- NULL + } + do.call(CsvConvertOptions$create, opts) } csv_file_format_read_opts <- function(schema = NULL, ...) { opts <- list(...) 
   # Filter out arguments meant for CsvParseOptions/CsvConvertOptions
-  arrow_opts <- names(formals(CsvParseOptions$create))
+  arrow_opts <- c(names(formals(CsvParseOptions$create)), "parse_options")
   readr_opts <- names(formals(readr_to_csv_parse_options))
-  convert_opts <- names(formals(CsvConvertOptions$create))
+  convert_opts <- c(names(formals(CsvConvertOptions$create)), "na", "convert_options")
   opts[arrow_opts] <- NULL
   opts[readr_opts] <- NULL
   opts[convert_opts] <- NULL
+  opts[["read_options"]] <- NULL
   opt_names <- names(opts)
   arrow_opts <- c(names(formals(CsvReadOptions$create)))
diff --git a/r/R/dataset.R b/r/R/dataset.R
index 732c05ecb0104..71247b3581ee9 100644
--- a/r/R/dataset.R
+++ b/r/R/dataset.R
@@ -228,6 +228,130 @@ open_dataset <- function(sources,
   )
 }
 
+#' Open a multi-file dataset of CSV or other delimiter-separated format
+#'
+#' A wrapper around [open_dataset] which explicitly includes parameters mirroring [read_csv_arrow()],
+#' [read_delim_arrow()], and [read_tsv_arrow()] to allow for easy switching between functions
+#' for opening single files and functions for opening datasets.
+#'
+#' @inheritParams open_dataset
+#' @inheritParams read_delim_arrow
+#'
+#' @section Options currently supported by [read_delim_arrow()] which are not supported here:
+#' * `file` (instead, please specify files in `sources`)
+#' * `col_select` (instead, subset columns after dataset creation)
+#' * `quoted_na`
+#' * `as_data_frame` (instead, convert to data frame after dataset creation)
+#' * `parse_options`
+#'
+#' @examplesIf arrow_with_dataset()
+#' # Set up directory for examples
+#' tf <- tempfile()
+#' dir.create(tf)
+#' df <- data.frame(x = c("1", "2", "NULL"))
+#'
+#' file_path <- file.path(tf, "file1.txt")
+#' write.table(df, file_path, sep = ",", row.names = FALSE)
+#'
+#' read_csv_arrow(file_path, na = c("", "NA", "NULL"), col_names = "y", skip = 1)
+#' open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_names = "y", skip = 1)
+#'
+#' unlink(tf)
+#' @seealso [open_dataset()]
+#' @export
+open_delim_dataset <- function(sources,
+                               schema = NULL,
+                               partitioning = hive_partition(),
+                               hive_style = NA,
+                               unify_schemas = NULL,
+                               factory_options = list(),
+                               delim = ",",
+                               quote = "\"",
+                               escape_double = TRUE,
+                               escape_backslash = FALSE,
+                               col_names = TRUE,
+                               col_types = NULL,
+                               na = c("", "NA"),
+                               skip_empty_rows = TRUE,
+                               skip = 0L,
+                               convert_options = NULL,
+                               read_options = NULL,
+                               timestamp_parsers = NULL) {
+  open_dataset(
+    sources = sources,
+    schema = schema,
+    partitioning = partitioning,
+    hive_style = hive_style,
+    unify_schemas = unify_schemas,
+    factory_options = factory_options,
+    format = "text",
+    delim = delim,
+    quote = quote,
+    escape_double = escape_double,
+    escape_backslash = escape_backslash,
+    col_names = col_names,
+    col_types = col_types,
+    na = na,
+    skip_empty_rows = skip_empty_rows,
+    skip = skip,
+    convert_options = convert_options,
+    read_options = read_options,
+    timestamp_parsers = timestamp_parsers
+  )
+}
+
+#' @rdname open_delim_dataset
+#' @export
+open_csv_dataset <- function(sources,
+                             schema = NULL,
+                             partitioning = hive_partition(),
+                             hive_style = NA,
+                             unify_schemas = NULL,
+                             factory_options = list(),
+                             quote = "\"",
+                             escape_double = TRUE,
+                             escape_backslash = FALSE,
+                             col_names = TRUE,
+                             col_types = NULL,
+                             na = c("", "NA"),
+                             skip_empty_rows = TRUE,
+                             skip = 0L,
+                             convert_options = NULL,
+                             read_options = NULL,
+                             timestamp_parsers = NULL) {
+  mc <- match.call()
+  mc$delim <- ","
+  mc[[1]] <- get("open_delim_dataset", envir = asNamespace("arrow"))
+  eval.parent(mc)
+}
+
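+# open_csv_dataset() (above) and open_tsv_dataset() (below) forward to
+# open_delim_dataset() via match.call() rather than by repeating every argument:
+# only the arguments the caller actually supplied are passed on, so
+# open_delim_dataset()'s own defaults apply to the rest. A minimal sketch of the
+# same pattern, using a hypothetical wrapper for semi-colon delimited files:
+#
+#   open_semicolon_dataset <- function(sources, ...) {
+#     mc <- match.call()              # capture the call as written
+#     mc$delim <- ";"                 # pin the delimiter
+#     mc[[1]] <- get("open_delim_dataset", envir = asNamespace("arrow"))
+#     eval.parent(mc)                 # evaluate in the caller's frame
+#   }
+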
+#' @rdname open_delim_dataset
+#' @export
+open_tsv_dataset <- function(sources,
+                             schema = NULL,
+                             partitioning = hive_partition(),
+                             hive_style = NA,
+                             unify_schemas = NULL,
+                             factory_options = list(),
+                             quote = "\"",
+                             escape_double = TRUE,
+                             escape_backslash = FALSE,
+                             col_names = TRUE,
+                             col_types = NULL,
+                             na = c("", "NA"),
+                             skip_empty_rows = TRUE,
+                             skip = 0L,
+                             convert_options = NULL,
+                             read_options = NULL,
+                             timestamp_parsers = NULL) {
+  mc <- match.call()
+  mc$delim <- "\t"
+  mc[[1]] <- get("open_delim_dataset", envir = asNamespace("arrow"))
+  eval.parent(mc)
+}
+
 #' Multi-file datasets
 #'
 #' @description
diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml
index 3d5dc2d1f259a..391d3407694d5 100644
--- a/r/_pkgdown.yml
+++ b/r/_pkgdown.yml
@@ -141,6 +141,9 @@ reference:
   - title: Multi-file datasets
     contents:
       - open_dataset
+      - open_delim_dataset
+      - open_csv_dataset
+      - open_tsv_dataset
       - write_dataset
       - dataset_factory
       - hive_partition
@@ -149,6 +152,7 @@ reference:
       - Expression
       - Scanner
       - FileFormat
+      - CsvFileFormat
       - FileWriteOptions
       - FragmentScanOptions
       - map_batches
diff --git a/r/man/CsvFileFormat.Rd b/r/man/CsvFileFormat.Rd
new file mode 100644
index 0000000000000..aa368b8f29d09
--- /dev/null
+++ b/r/man/CsvFileFormat.Rd
@@ -0,0 +1,41 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dataset-format.R
+\name{CsvFileFormat}
+\alias{CsvFileFormat}
+\title{CSV dataset file format}
+\value{
+A \code{CsvFileFormat} object
+}
+\description{
+A \code{CsvFileFormat} is a \link{FileFormat} subclass which holds information about how to
+read and parse the files included in a CSV \code{Dataset}.
+}
+\section{Factory}{

+\code{CsvFileFormat$create()} can take options in the form of lists passed through as \code{parse_options},
+\code{read_options}, or \code{convert_options} parameters. Alternatively, readr-style options can be passed
+through individually. While it is possible to pass in \code{CsvReadOptions}, \code{CsvConvertOptions}, and \code{CsvParseOptions}
+objects, this is not recommended as options set in these objects are not validated for compatibility.
+}

+\examples{
+\dontshow{if (arrow_with_dataset()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+# Set up directory for examples
+tf <- tempfile()
+dir.create(tf)
+on.exit(unlink(tf))
+df <- data.frame(x = c("1", "2", "NULL"))
+write.table(df, file.path(tf, "file1.txt"), sep = ",", row.names = FALSE)
+
+# Create CsvFileFormat object with Arrow-style null_values option
+format <- CsvFileFormat$create(convert_options = list(null_values = c("", "NA", "NULL")))
+open_dataset(tf, format = format)
+
+# Use readr-style options
+format <- CsvFileFormat$create(na = c("", "NA", "NULL"))
+open_dataset(tf, format = format)
+\dontshow{\}) # examplesIf}
+}
+\seealso{
+\link{FileFormat}
+}
diff --git a/r/man/FileFormat.Rd b/r/man/FileFormat.Rd
index 3c6fd330b0163..296de02ead2e5 100644
--- a/r/man/FileFormat.Rd
+++ b/r/man/FileFormat.Rd
@@ -4,7 +4,6 @@
 \alias{FileFormat}
 \alias{ParquetFileFormat}
 \alias{IpcFileFormat}
-\alias{CsvFileFormat}
 \title{Dataset file formats}
 \description{
 A \code{FileFormat} holds information about how to read and parse the files
@@ -52,7 +51,7 @@ It returns the appropriate subclass of \code{FileFormat} (e.g.
 \code{ParquetFileFormat})
 }
 
 \examples{
-\dontshow{if (arrow_with_dataset() && tolower(Sys.info()[["sysname"]]) != "windows") (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+\dontshow{if (arrow_with_dataset()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
 ## Semi-colon delimited files
 # Set up directory for examples
 tf <- tempfile()
diff --git a/r/man/acero.Rd b/r/man/acero.Rd
index 6b5563acdf94e..6d4476c44c278 100644
--- a/r/man/acero.Rd
+++ b/r/man/acero.Rd
@@ -53,9 +53,9 @@ Table into an R \code{data.frame}.
 \item \code{\link[dplyr:slice]{slice_tail()}}: slicing within groups not supported; Arrow datasets do not have row order, so tail is non-deterministic; \code{prop} only supported on queries where \code{nrow()} is knowable without evaluating
 \item \code{\link[dplyr:summarise]{summarise()}}: window functions not currently supported; arguments \code{.drop = FALSE} and \code{.groups = "rowwise"} not supported
 \item \code{\link[dplyr:count]{tally()}}
-\item \code{\link[dplyr:mutate]{transmute()}}
+\item \code{\link[dplyr:transmute]{transmute()}}
 \item \code{\link[dplyr:group_by]{ungroup()}}
-\item \code{\link[dplyr:reexports]{union()}}
+\item \code{\link[dplyr:setops]{union()}}
 \item \code{\link[dplyr:setops]{union_all()}}
 }
 }
diff --git a/r/man/open_delim_dataset.Rd b/r/man/open_delim_dataset.Rd
new file mode 100644
index 0000000000000..d127f772c6340
--- /dev/null
+++ b/r/man/open_delim_dataset.Rd
@@ -0,0 +1,216 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dataset.R
+\name{open_delim_dataset}
+\alias{open_delim_dataset}
+\alias{open_csv_dataset}
+\alias{open_tsv_dataset}
+\title{Open a multi-file dataset of CSV or other delimiter-separated format}
+\usage{
+open_delim_dataset(
+  sources,
+  schema = NULL,
+  partitioning = hive_partition(),
+  hive_style = NA,
+  unify_schemas = NULL,
+  factory_options = list(),
+  delim = ",",
+  quote = "\\"",
+  escape_double = TRUE,
+  escape_backslash = FALSE,
+  col_names = TRUE,
+  col_types = NULL,
+  na = c("", "NA"),
+  skip_empty_rows = TRUE,
+  skip = 0L,
+  convert_options = NULL,
+  read_options = NULL,
+  timestamp_parsers = NULL
+)
+
+open_csv_dataset(
+  sources,
+  schema = NULL,
+  partitioning = hive_partition(),
+  hive_style = NA,
+  unify_schemas = NULL,
+  factory_options = list(),
+  quote = "\\"",
+  escape_double = TRUE,
+  escape_backslash = FALSE,
+  col_names = TRUE,
+  col_types = NULL,
+  na = c("", "NA"),
+  skip_empty_rows = TRUE,
+  skip = 0L,
+  convert_options = NULL,
+  read_options = NULL,
+  timestamp_parsers = NULL
+)
+
+open_tsv_dataset(
+  sources,
+  schema = NULL,
+  partitioning = hive_partition(),
+  hive_style = NA,
+  unify_schemas = NULL,
+  factory_options = list(),
+  quote = "\\"",
+  escape_double = TRUE,
+  escape_backslash = FALSE,
+  col_names = TRUE,
+  col_types = NULL,
+  na = c("", "NA"),
+  skip_empty_rows = TRUE,
+  skip = 0L,
+  convert_options = NULL,
+  read_options = NULL,
+  timestamp_parsers = NULL
+)
+}
+\arguments{
+\item{sources}{One of:
+\itemize{
+\item a string path or URI to a directory containing data files
+\item a \link{FileSystem} that references a directory containing data files
+(such as what is returned by \code{\link[=s3_bucket]{s3_bucket()}})
+\item a string path or URI to a single file
+\item a character vector of paths or URIs to individual data files
+\item a list of \code{Dataset} objects as created by this function
+\item a list of \code{DatasetFactory} objects as created by \code{\link[=dataset_factory]{dataset_factory()}}.
+} + +When \code{sources} is a vector of file URIs, they must all use the same protocol +and point to files located in the same file system and having the same +format.} + +\item{schema}{\link{Schema} for the \code{Dataset}. If \code{NULL} (the default), the schema +will be inferred from the data sources.} + +\item{partitioning}{When \code{sources} is a directory path/URI, one of: +\itemize{ +\item a \code{Schema}, in which case the file paths relative to \code{sources} will be +parsed, and path segments will be matched with the schema fields. +\item a character vector that defines the field names corresponding to those +path segments (that is, you're providing the names that would correspond +to a \code{Schema} but the types will be autodetected) +\item a \code{Partitioning} or \code{PartitioningFactory}, such as returned +by \code{\link[=hive_partition]{hive_partition()}} +\item \code{NULL} for no partitioning +} + +The default is to autodetect Hive-style partitions unless +\code{hive_style = FALSE}. See the "Partitioning" section for details. +When \code{sources} is not a directory path/URI, \code{partitioning} is ignored.} + +\item{hive_style}{Logical: should \code{partitioning} be interpreted as +Hive-style? Default is \code{NA}, which means to inspect the file paths for +Hive-style partitioning and behave accordingly.} + +\item{unify_schemas}{logical: should all data fragments (files, \code{Dataset}s) +be scanned in order to create a unified schema from them? If \code{FALSE}, only +the first fragment will be inspected for its schema. Use this fast path +when you know and trust that all fragments have an identical schema. +The default is \code{FALSE} when creating a dataset from a directory path/URI or +vector of file paths/URIs (because there may be many files and scanning may +be slow) but \code{TRUE} when \code{sources} is a list of \code{Dataset}s (because there +should be few \code{Dataset}s in the list and their \code{Schema}s are already in +memory).} + +\item{factory_options}{list of optional FileSystemFactoryOptions: +\itemize{ +\item \code{partition_base_dir}: string path segment prefix to ignore when +discovering partition information with DirectoryPartitioning. Not +meaningful (ignored with a warning) for HivePartitioning, nor is it +valid when providing a vector of file paths. +\item \code{exclude_invalid_files}: logical: should files that are not valid data +files be excluded? Default is \code{FALSE} because checking all files up +front incurs I/O and thus will be slower, especially on remote +filesystems. If false and there are invalid files, there will be an +error at scan time. This is the only FileSystemFactoryOption that is +valid for both when providing a directory path in which to discover +files and when providing a vector of file paths. +\item \code{selector_ignore_prefixes}: character vector of file prefixes to ignore +when discovering files in a directory. If invalid files can be excluded +by a common filename prefix this way, you can avoid the I/O cost of +\code{exclude_invalid_files}. Not valid when providing a vector of file paths +(but if you're providing the file list, you can filter invalid files +yourself). +}} + +\item{delim}{Single character used to separate fields within a record.} + +\item{quote}{Single character used to quote strings.} + +\item{escape_double}{Does the file escape quotes by doubling them? +i.e. 
If this option is \code{TRUE}, the value \verb{""""} represents
+a single quote, \verb{\\"}.}
+
+\item{escape_backslash}{Does the file use backslashes to escape special
+characters? This is more general than \code{escape_double} as backslashes
+can be used to escape the delimiter character, the quote character, or
+to add special characters like \verb{\\\\n}.}
+
+\item{col_names}{If \code{TRUE}, the first row of the input will be used as the
+column names and will not be included in the data frame. If \code{FALSE}, column
+names will be generated by Arrow, starting with "f0", "f1", ..., "fN".
+Alternatively, you can specify a character vector of column names.}
+
+\item{col_types}{A compact string representation of the column types,
+an Arrow \link{Schema}, or \code{NULL} (the default) to infer types from the data.}
+
+\item{na}{A character vector of strings to interpret as missing values.}
+
+\item{skip_empty_rows}{Should blank rows be ignored altogether? If
+\code{TRUE}, blank rows will not be represented at all. If \code{FALSE}, they will be
+filled with missings.}
+
+\item{skip}{Number of lines to skip before reading data.}
+
+\item{convert_options}{see \link[=CsvReadOptions]{file reader options}}
+
+\item{read_options}{see \link[=CsvReadOptions]{file reader options}}
+
+\item{timestamp_parsers}{User-defined timestamp parsers. If more than one
+parser is specified, the CSV conversion logic will try parsing values
+starting from the beginning of this vector. Possible values are:
+\itemize{
+\item \code{NULL}: the default, which uses the ISO-8601 parser
+\item a character vector of \link[base:strptime]{strptime} parse strings
+\item a list of \link{TimestampParser} objects
+}}
+}
+\description{
+A wrapper around \link{open_dataset} which explicitly includes parameters mirroring \code{\link[=read_csv_arrow]{read_csv_arrow()}},
+\code{\link[=read_delim_arrow]{read_delim_arrow()}}, and \code{\link[=read_tsv_arrow]{read_tsv_arrow()}} to allow for easy switching between functions
+for opening single files and functions for opening datasets.
+} +\section{Options currently supported by \code{\link[=read_delim_arrow]{read_delim_arrow()}} which are not supported here}{ + +\itemize{ +\item \code{file} (instead, please specify files in \code{sources}) +\item \code{col_select} (instead, subset columns after dataset creation) +\item \code{quoted_na} +\item \code{as_data_frame} (instead, convert to data frame after dataset creation) +\item \code{parse_options} +} +} + +\examples{ +\dontshow{if (arrow_with_dataset()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# Set up directory for examples +tf <- tempfile() +dir.create(tf) +df <- data.frame(x = c("1", "2", "NULL")) + +file_path <- file.path(tf, "file1.txt") +write.table(df, file_path, sep = ",", row.names = FALSE) + +read_csv_arrow(file_path, na = c("", "NA", "NULL"), col_names = "y", skip = 1) +open_csv_dataset(file_path, na = c("", "NA", "NULL"), col_names = "y", skip = 1) + +unlink(tf) +\dontshow{\}) # examplesIf} +} +\seealso{ +\code{\link[=open_dataset]{open_dataset()}} +} diff --git a/r/tests/testthat/test-dataset-csv.R b/r/tests/testthat/test-dataset-csv.R index 436db985fbff6..b25c57b2ba2ff 100644 --- a/r/tests/testthat/test-dataset-csv.R +++ b/r/tests/testthat/test-dataset-csv.R @@ -218,9 +218,9 @@ test_that("readr parse options", { character(0) ) - # With not yet supported readr parse options (ARROW-8631) + # With not yet supported readr parse options expect_error( - open_dataset(tsv_dir, partitioning = "part", delim = "\t", na = "\\N"), + open_dataset(tsv_dir, partitioning = "part", delim = "\t", quoted_na = TRUE), "supported" ) @@ -476,3 +476,89 @@ test_that("CSV reading/parsing/convert options can be passed in as lists", { expect_equal(ds1, ds2) }) + +test_that("open_delim_dataset params passed through to open_dataset", { + ds <- open_delim_dataset(csv_dir, delim = ",", partitioning = "part") + expect_r6_class(ds$format, "CsvFileFormat") + expect_r6_class(ds$filesystem, "LocalFileSystem") + expect_identical(names(ds), c(names(df1), "part")) + expect_identical(dim(ds), c(20L, 7L)) + + # quote + dst_dir <- make_temp_dir() + dst_file <- file.path(dst_dir, "data.csv") + + df <- data.frame(a = c(1, 2), b = c("'abc'", "'def'")) + write.csv(df, dst_file, row.names = FALSE, quote = FALSE) + + ds_quote <- open_csv_dataset(dst_dir, quote = "'") %>% collect() + expect_equal(ds_quote$b, c("abc", "def")) + + # na + ds <- open_csv_dataset(csv_dir, partitioning = "part", na = c("", "NA", "FALSE")) %>% collect() + expect_identical(ds$lgl, c( + TRUE, NA, NA, TRUE, NA, TRUE, NA, NA, TRUE, NA, TRUE, NA, NA, + TRUE, NA, TRUE, NA, NA, TRUE, NA + )) + + # col_names and skip + ds <- open_csv_dataset( + csv_dir, + partitioning = "part", + col_names = paste0("col_", 1:6), + skip = 1 + ) %>% collect() + + expect_named(ds, c("col_1", "col_2", "col_3", "col_4", "col_5", "col_6", "part")) + expect_equal(nrow(ds), 20) + + # col_types + dst_dir <- make_temp_dir() + dst_file <- file.path(dst_dir, "data.csv") + + df <- data.frame(a = c(1, NA, 2), b = c("'abc'", NA, "'def'")) + write.csv(df, dst_file, row.names = FALSE, quote = FALSE) + + data_schema <- schema(a = string(), b = string()) + ds_strings <- open_csv_dataset(dst_dir, col_types = data_schema) + expect_equal(ds_strings$schema, schema(a = string(), b = string())) + + # skip_empty_rows + tf <- tempfile() + writeLines('"x"\n"y"\nNA\nNA\n"NULL"\n\n\n', tf) + + ds <- open_csv_dataset(tf, skip_empty_rows = FALSE) %>% collect() + expect_equal(nrow(ds), 7) + + # convert_options + ds <- open_csv_dataset( + csv_dir, + 
+    convert_options = list(null_values = c("NA", "", "FALSE"), strings_can_be_null = TRUE)
+  ) %>% collect()
+
+  expect_equal(
+    ds$lgl,
+    c(TRUE, NA, NA, TRUE, NA, TRUE, NA, NA, TRUE, NA, TRUE, NA, NA, TRUE, NA, TRUE, NA, NA, TRUE, NA)
+  )
+
+  # read_options
+  ds <- open_csv_dataset(
+    csv_dir,
+    read_options = list(column_names = paste0("col_", 1:6))
+  ) %>% collect()
+
+  expect_named(ds, c("col_1", "col_2", "col_3", "col_4", "col_5", "col_6"))
+
+  # timestamp_parsers
+  skip("GH-33708: timestamp_parsers don't appear to be working properly")
+
+  dst_dir <- make_temp_dir()
+  dst_file <- file.path(dst_dir, "data.csv")
+
+  df <- data.frame(time = "2023-01-16 19:47:57")
+  write.csv(df, dst_file, row.names = FALSE, quote = FALSE)
+
+  ds <- open_csv_dataset(dst_dir, timestamp_parsers = c(TimestampParser$create(format = "%d-%m-%y"))) %>% collect()
+
+  expect_equal(ds$time, "16-01-2023")
+})
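Taken together, the new wrappers let the same readr-style arguments move between the
single-file readers and the dataset API. A minimal end-to-end sketch of the intended
workflow (file names and data here are illustrative only):

    library(arrow)
    library(dplyr)

    tf <- tempfile()
    dir.create(tf)
    write.csv(
      data.frame(x = c(1, 2), y = c("a", "NULL")),
      file.path(tf, "part-0.csv"),
      row.names = FALSE
    )

    # Single file: read_csv_arrow() eagerly returns a data frame
    read_csv_arrow(file.path(tf, "part-0.csv"), na = c("", "NA", "NULL"))

    # Same arguments, many files: open_csv_dataset() returns a lazy Dataset
    # that dplyr verbs can query before anything is pulled into memory
    ds <- open_csv_dataset(tf, na = c("", "NA", "NULL"))
    ds %>% filter(x > 1) %>% collect()

    unlink(tf, recursive = TRUE)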