21 #include <parquet/properties.h>
26 #include <unordered_map>
30 #include "arrow/adapters/orc/adapter.h"
32 #include "arrow/api.h"
33 #include "arrow/csv/api.h"
34 #include "arrow/dataset/api.h"
35 #include "arrow/filesystem/api.h"
36 #include "parquet/arrow/writer.h"
// CSV writer settings. These are the fields the CSV builder writes through its
// shared CSVOption pointer; the initializers below are the values in effect
// when the corresponding setter is never called.

// I/O context; default-constructed unless replaced through the builder.
69 arrow::io::IOContext io_context;
// Empty by default.
// NOTE(review): presumably the text emitted for null values, mirroring
// arrow::csv::WriteOptions::null_string — confirm in getCsvOption().
70 std::string null_string;
// End-of-line sequence; defaults to a single line feed.
71 std::string eol =
"\n";
// Header flag; defaults to true.
72 bool include_header =
true;
// Defaults to 1024. NOTE(review): unit is presumably rows per batch — confirm.
74 int32_t batch_size = 1024;
// Quoting policy; defaults to QuotingStyle::Needed.
75 arrow::csv::QuotingStyle quoting_style = arrow::csv::QuotingStyle::Needed;
// Parquet writer settings. These are the fields the Parquet builder writes
// through its shared ParquetOption pointer; the initializers below are the
// values in effect when the corresponding setter is never called.

// File encryption properties; null by default (default-constructed shared_ptr).
86 std::shared_ptr<::parquet::FileEncryptionProperties> encryption_properties;

// Per-column overrides keyed by a string. All maps start empty.
// NOTE(review): keys are presumably column paths/names — confirm against the
// code that consumes them.
87 std::unordered_map<std::string, ::parquet::Encoding::type> column_encoding;
// Per-column compression codec map (the builder assigns it as
// option_->column_compression).
88 std::unordered_map<std::string, arrow::Compression::type>
90 std::unordered_map<std::string, int> column_compression_level;
91 std::unordered_map<std::string, size_t> column_max_statistics_size;
92 std::unordered_map<std::string, bool> column_statistics;
93 std::unordered_map<std::string, bool> column_write_page_index;
// Sorting columns; empty by default.
94 std::vector<::parquet::SortingColumn> sorting_columns;

// Size and length limits.
// NOTE(review): the *_pagesize* values are presumably byte counts and the
// batch/row-group values row counts — confirm against parquet::WriterProperties.
// 1024 * 1024 = 1,048,576.
95 int64_t dictionary_pagesize_limit = 1024 * 1024;
96 int64_t write_batch_size = 1024;
97 int64_t max_row_group_length = 1024 * 1024;
98 int64_t data_pagesize = 1024 * 1024;
99 size_t max_statistics_size = 4096;
// Defaults to the smallest representable int.
// NOTE(review): this looks like the sentinel Arrow uses for "use the codec's
// default level" (kUseDefaultCompressionLevel) — confirm.
100 int compression_level = std::numeric_limits<int>::min();

// Format and codec selections: data page V1, format version PARQUET_2_6,
// PLAIN encoding, ZSTD compression, timestamps coerced to MICRO.
101 ::parquet::ParquetDataPageVersion data_page_version =
102 ::parquet::ParquetDataPageVersion::V1;
103 ::parquet::ParquetVersion::type version =
104 ::parquet::ParquetVersion::PARQUET_2_6;
105 ::parquet::Encoding::type encoding = ::parquet::Encoding::PLAIN;
106 arrow::Compression::type compression = arrow::Compression::ZSTD;
107 ::arrow::TimeUnit::type coerce_timestamps = ::arrow::TimeUnit::MICRO;
// Raw, non-owning executor pointer; null by default.
108 ::arrow::internal::Executor* executor =
nullptr;

// Boolean switches. On by default: enable_dictionary, enable_statistics,
// compliant_nested_types. All others are off by default.
109 bool enable_dictionary =
true;
110 bool enable_statistics =
true;
111 bool enable_store_decimal_as_integer =
false;
112 bool enable_write_page_index =
false;
113 bool compliant_nested_types =
true;
114 bool use_threads =
false;
115 bool enable_deprecated_int96_timestamps =
false;
116 bool allow_truncated_timestamps =
false;
117 bool store_schema =
false;
// ORC writer settings. These are the fields the ORC builder writes through its
// shared ORCOption pointer; the initializers below are the values in effect
// when the corresponding setter is never called.

// Bloom-filter columns, stored as int64_t values; empty by default.
// NOTE(review): presumably column indices — confirm against getOrcOption().
128 std::vector<int64_t> bloom_filter_columns;
// File version defaults to FileVersion(0, 12).
129 arrow::adapters::orc::FileVersion file_version =
130 arrow::adapters::orc::FileVersion(0, 12);
// Compression strategy defaults to kSpeed.
131 arrow::adapters::orc::CompressionStrategy compression_strategy =
132 arrow::adapters::orc::CompressionStrategy::kSpeed;
// Codec defaults to UNCOMPRESSED.
133 arrow::Compression::type compression = arrow::Compression::UNCOMPRESSED;
// 64 * 1024 * 1024 = 67,108,864. NOTE(review): presumably bytes — confirm.
134 int64_t stripe_size = 64 * 1024 * 1024;
135 int64_t batch_size = 1024;
// 64 * 1024 = 65,536. NOTE(review): presumably bytes — confirm.
136 int64_t compression_block_size = 64 * 1024;
137 int64_t row_index_stride = 10000;
// Floating-point settings; both thresholds default to 0.0 and the bloom-filter
// false-positive setting to 0.05.
138 double padding_tolerance = 0.0;
139 double dictionary_key_size_threshold = 0.0;
140 double bloom_filter_fpp = 0.05;
// Member initialisation for the CSV builder constructor.
// - writerOptions_ keeps the caller's WriterOptions pointer (may be null).
// - option_ reuses wopt's existing CSVOption when wopt is non-null and already
//   has one; otherwise a fresh, default-initialised CSVOption is allocated.
// In the reuse case option_ aliases the object owned by wopt, so every setter
// call on this builder is immediately visible through wopt as well.
150 : writerOptions_(wopt),
151 option_(wopt && wopt->csvOption_ ? wopt->csvOption_
152 : std::make_shared<CSVOption>()) {}
// Setter bodies of the CSV builder: each assignment stores the caller-supplied
// value in the CSVOption referenced by option_. No validation is performed in
// these statements.
// Header flag.
154 option_->include_header = header;
// Batch size.
158 option_->batch_size = bs;
// Field delimiter.
162 option_->delimiter = d;
// Null string.
166 option_->null_string = ns;
// I/O context.
170 option_->io_context = ctx;
// Quoting style.
178 option_->quoting_style = qs;
// Finalises the builder: returns the WriterOptions that carries the CSV option
// configured so far.
// If the builder was created without a WriterOptions, a default-constructed
// one is allocated first. The option is then installed with setCsvOption()
// and the builder's own shared pointer is returned, so repeated build() calls
// return the same WriterOptions instance.
181 std::shared_ptr<WriterOptions> build() {
182 if (!writerOptions_) {
183 writerOptions_ = std::make_shared<WriterOptions>();
185 writerOptions_->setCsvOption(option_);
186 return writerOptions_;
// Target WriterOptions; may be null until build() is called.
190 std::shared_ptr<WriterOptions> writerOptions_;
// CSV option being configured; always non-null after construction.
191 std::shared_ptr<CSVOption> option_;
// Member initialisation for the Parquet builder constructor.
// - writerOptions_ keeps the caller's WriterOptions pointer (may be null).
// - option_ reuses wopt's existing ParquetOption when wopt is non-null and
//   already has one; otherwise a fresh, default-initialised ParquetOption is
//   allocated.
// In the reuse case option_ aliases the object owned by wopt, so every setter
// call on this builder is immediately visible through wopt as well.
199 : writerOptions_(wopt),
200 option_(wopt && wopt->parquetOption_
201 ? wopt->parquetOption_
202 : std::make_shared<ParquetOption>()) {}
// Setter bodies of the Parquet builder: each assignment stores the
// caller-supplied value in the ParquetOption referenced by option_. No
// validation is performed in these statements. Map- and vector-valued setters
// assign the whole container, replacing (not merging with) earlier contents.

// Dictionary and size limits.
204 option_->enable_dictionary = enable;
208 option_->dictionary_pagesize_limit = limit;
212 option_->write_batch_size = batch_size;
216 option_->max_row_group_length = length;
220 option_->data_pagesize = pagesize;
// Data page version and format version.
224 ::parquet::ParquetDataPageVersion version) {
225 option_->data_page_version = version;
229 option_->version = ver;
// Encoding: file-wide value, then the per-column map.
233 option_->encoding = enc;
237 const std::unordered_map<std::string, ::parquet::Encoding::type>&
239 option_->column_encoding = encodings;
// Compression codec: file-wide value, then the per-column map.
243 option_->compression = comp;
247 const std::unordered_map<std::string, arrow::Compression::type>&
249 option_->column_compression = compressions;
// Compression level: file-wide value, then the per-column map.
253 option_->compression_level = level;
257 const std::unordered_map<std::string, int>& levels) {
258 option_->column_compression_level = levels;
// Maximum statistics size: file-wide value, then the per-column map.
262 option_->max_statistics_size = size;
266 const std::unordered_map<std::string, size_t>& sizes) {
267 option_->column_max_statistics_size = sizes;
// Encryption properties; the shared pointer is copied, not the pointee.
271 const std::shared_ptr<::parquet::FileEncryptionProperties>& props) {
272 option_->encryption_properties = props;
// Statistics: file-wide switch, then the per-column map.
276 option_->enable_statistics = enable;
280 const std::unordered_map<std::string, bool>& stats) {
281 option_->column_statistics = stats;
// Sorting columns.
285 const std::vector<::parquet::SortingColumn>& columns) {
286 option_->sorting_columns = columns;
// Decimal storage switch.
290 option_->enable_store_decimal_as_integer = enable;
// Page index: file-wide switch, then the per-column map.
294 option_->enable_write_page_index = enable;
298 const std::unordered_map<std::string, bool>& indices) {
299 option_->column_write_page_index = indices;
// Remaining flags and the timestamp coercion unit.
303 option_->compliant_nested_types = compliant;
307 option_->use_threads = use;
311 option_->enable_deprecated_int96_timestamps = enable;
315 option_->coerce_timestamps = unit;
319 option_->allow_truncated_timestamps = allow;
323 option_->store_schema = store;
// Executor is stored as a raw pointer; the builder takes no ownership here.
327 option_->executor = exec;
// Finalises the builder: returns the WriterOptions that carries the Parquet
// option configured so far.
// If the builder was created without a WriterOptions, a default-constructed
// one is allocated first. The option is then installed with
// setParquetOption() and the builder's own shared pointer is returned, so
// repeated build() calls return the same WriterOptions instance.
330 std::shared_ptr<WriterOptions> build() {
331 if (!writerOptions_) {
332 writerOptions_ = std::make_shared<WriterOptions>();
334 writerOptions_->setParquetOption(option_);
335 return writerOptions_;
// Target WriterOptions; may be null until build() is called.
339 std::shared_ptr<WriterOptions> writerOptions_;
// Parquet option being configured; always non-null after construction.
340 std::shared_ptr<ParquetOption> option_;
// Member initialisation for the ORC builder constructor.
// - writerOptions_ keeps the caller's WriterOptions pointer (may be null).
// - option_ reuses wopt's existing ORCOption when wopt is non-null and already
//   has one; otherwise a fresh, default-initialised ORCOption is allocated.
// In the reuse case option_ aliases the object owned by wopt, so every setter
// call on this builder is immediately visible through wopt as well.
348 : writerOptions_(wopt),
349 option_(wopt && wopt->orcOption_ ? wopt->orcOption_
350 : std::make_shared<ORCOption>()) {}
// Setter bodies of the ORC builder: each statement stores the caller-supplied
// value in the ORCOption referenced by option_. No validation is performed in
// these statements.
// Batch size.
353 option_->batch_size = bs;
// File version.
357 option_->file_version = fv;
// Stripe size.
361 option_->stripe_size = ss;
// Compression codec.
365 option_->compression = comp;
// Compression block size.
369 option_->compression_block_size = cbs;
// Compression strategy.
373 arrow::adapters::orc::CompressionStrategy cs) {
374 option_->compression_strategy = cs;
// Row index stride.
378 option_->row_index_stride = ris;
// Padding tolerance.
382 option_->padding_tolerance = pt;
// Dictionary key size threshold.
386 option_->dictionary_key_size_threshold = dkst;
// Unlike the other setters this one appends: every call adds one more entry to
// bloom_filter_columns, with no de-duplication and no way to clear the list
// from this statement.
390 option_->bloom_filter_columns.push_back(bfc);
// Bloom-filter false-positive setting.
394 option_->bloom_filter_fpp = bffpp;
// Finalises the builder: returns the WriterOptions that carries the ORC option
// configured so far.
// If the builder was created without a WriterOptions, a default-constructed
// one is allocated first. The option is then installed with setOrcOption()
// and the builder's own shared pointer is returned, so repeated build() calls
// return the same WriterOptions instance.
398 std::shared_ptr<WriterOptions> build() {
399 if (!writerOptions_) {
400 writerOptions_ = std::make_shared<WriterOptions>();
402 writerOptions_->setOrcOption(option_);
403 return writerOptions_;
// Target WriterOptions; may be null until build() is called.
407 std::shared_ptr<WriterOptions> writerOptions_;
// ORC option being configured; always non-null after construction.
408 std::shared_ptr<ORCOption> option_;
413 std::shared_ptr<ParquetOption> parquet,
414 std::shared_ptr<ORCOption> orc)
// Stores the three per-format option pointers exactly as given. No null checks
// are made here; the option builders test these members for null before
// reusing them.
415 : csvOption_(csv), parquetOption_(parquet), orcOption_(orc) {}
// Factory: returns a newly allocated WriterOptions constructed with no
// arguments. Each call yields a distinct instance.
416 static std::shared_ptr<WriterOptions> DefaultWriterOption() {
417 return std::make_shared<WriterOptions>();
// Per-format setters. Each replaces the stored shared pointer with the one
// supplied; the pointee is shared with the caller, not copied, and a null
// argument is accepted as-is.

// Replaces the CSV option.
419 void setCsvOption(std::shared_ptr<CSVOption> csv_option) {
420 csvOption_ = csv_option;
// Replaces the Parquet option.
422 void setParquetOption(std::shared_ptr<ParquetOption> parquet_option) {
423 parquetOption_ = parquet_option;
// Replaces the ORC option.
425 void setOrcOption(std::shared_ptr<ORCOption> orc_option) {
426 orcOption_ = orc_option;
// Conversion accessors, declared here only. Judging by their return types,
// each produces the Arrow/Parquet-native options object for one format.
// NOTE(review): presumably built from csvOption_ / parquetOption_ /
// orcOption_, and behaviour when the corresponding pointer is null is not
// visible here — confirm in the definitions.

// Returns Arrow CSV write options by value.
428 arrow::csv::WriteOptions getCsvOption()
const;
// Returns Parquet writer properties.
429 std::shared_ptr<parquet::WriterProperties> getParquetWriterProperties()
const;
// Returns Parquet Arrow-writer properties.
430 std::shared_ptr<parquet::ArrowWriterProperties> getArrowWriterProperties()
// Returns Arrow ORC write options by value.
433 arrow::adapters::orc::WriteOptions getOrcOption()
const;
// Per-format option holders. Each is a shared pointer that may be null and may
// be shared with an option builder that was constructed from this
// WriterOptions.
437 std::shared_ptr<CSVOption> csvOption_;
438 std::shared_ptr<ParquetOption> parquetOption_;
439 std::shared_ptr<ORCOption> orcOption_;
445 enum class ValidateLevel : char {
447 default_validate = 0,
Provides configuration options for the different file-format writers (CSV, Parquet, ORC) in GraphAr.