20 #include "graphar/label.h"
31 int read_parquet_file_and_get_valid_indices(
32 const char* parquet_filename,
const int row_num,
const int tot_label_num,
33 const int tested_label_num, std::vector<int> tested_label_ids,
34 const std::function<
bool(
bool*,
int)>& IsValid,
int chunk_idx,
35 int chunk_size, std::vector<int>* indices, uint64_t* bitmap,
36 const QUERY_TYPE query_type) {
38 std::unique_ptr<parquet::ParquetFileReader> parquet_reader =
39 parquet::ParquetFileReader::OpenFile(
40 parquet_filename + std::to_string(chunk_idx),
false);
43 std::shared_ptr<parquet::FileMetaData> file_metadata =
44 parquet_reader->metadata();
45 int row_group_count = file_metadata->num_row_groups();
46 int num_columns = file_metadata->num_columns();
49 std::vector<int> col_row_counts(num_columns, 0);
50 bool** value =
new bool*[num_columns];
51 for (
int i = 0; i < num_columns; i++) {
52 value[i] =
new bool[row_num];
56 for (
int rg = 0; rg < row_group_count; ++rg) {
58 std::shared_ptr<parquet::RowGroupReader> row_group_reader =
59 parquet_reader->RowGroup(rg);
61 int64_t values_read = 0;
62 int64_t rows_read = 0;
63 std::shared_ptr<parquet::ColumnReader> column_reader;
65 ARROW_UNUSED(rows_read);
68 for (
int k = 0; k < tested_label_num; k++) {
69 int col_id = tested_label_ids[k];
71 column_reader = row_group_reader->Column(col_id);
72 parquet::BoolReader* bool_reader =
73 static_cast<parquet::BoolReader*
>(column_reader.get());
75 while (bool_reader->HasNext()) {
79 rows_read = bool_reader->ReadBatch(BATCH_SIZE,
nullptr,
nullptr,
80 value[k] + col_row_counts[col_id],
84 col_row_counts[col_id] += rows_read;
88 const int kTotLabelNum = tot_label_num;
89 bool state[kTotLabelNum];
91 int offset = chunk_idx * chunk_size;
92 for (
int i = 0; i < row_num; i++) {
93 for (
int j = 0; j < tested_label_num; j++) {
94 state[j] = value[j][i];
96 if (IsValid(state, tested_label_num)) {
98 if (query_type == QUERY_TYPE::INDEX)
100 indices->push_back(i + offset);
101 else if (query_type == QUERY_TYPE::BITMAP)
102 SetBitmap(bitmap, i);
107 for (
int i = 0; i < num_columns; i++) {