Apache GraphAr C++ Library
The C++ Library for Apache GraphAr
label.cc
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #include "graphar/label.h"
21 
22 #include <cassert>
23 #include <cstring>
24 #include <fstream>
25 #include <iostream>
26 #include <memory>
27 #include <set>
28 
31 int read_parquet_file_and_get_valid_indices(
32  const char* parquet_filename, const int row_num, const int tot_label_num,
33  const int tested_label_num, std::vector<int> tested_label_ids,
34  const std::function<bool(bool*, int)>& IsValid, int chunk_idx,
35  int chunk_size, std::vector<int>* indices, uint64_t* bitmap,
36  const QUERY_TYPE query_type) {
37  // Create a ParquetReader instance
38  std::unique_ptr<parquet::ParquetFileReader> parquet_reader =
39  parquet::ParquetFileReader::OpenFile(
40  parquet_filename + std::to_string(chunk_idx), false);
41 
42  // Get the File MetaData
43  std::shared_ptr<parquet::FileMetaData> file_metadata =
44  parquet_reader->metadata();
45  int row_group_count = file_metadata->num_row_groups();
46  int num_columns = file_metadata->num_columns();
47 
48  // Initialize the column row counts
49  std::vector<int> col_row_counts(num_columns, 0);
50  bool** value = new bool*[num_columns];
51  for (int i = 0; i < num_columns; i++) {
52  value[i] = new bool[row_num];
53  }
54 
55  // Iterate over all the RowGroups in the file
56  for (int rg = 0; rg < row_group_count; ++rg) {
57  // Get the RowGroup Reader
58  std::shared_ptr<parquet::RowGroupReader> row_group_reader =
59  parquet_reader->RowGroup(rg);
60 
61  int64_t values_read = 0;
62  int64_t rows_read = 0;
63  std::shared_ptr<parquet::ColumnReader> column_reader;
64 
65  ARROW_UNUSED(rows_read); // prevent warning in release build
66 
67  // Read the label columns
68  for (int k = 0; k < tested_label_num; k++) {
69  int col_id = tested_label_ids[k];
70  // Get the Column Reader for the Bool column
71  column_reader = row_group_reader->Column(col_id);
72  parquet::BoolReader* bool_reader =
73  static_cast<parquet::BoolReader*>(column_reader.get());
74  // Read all the rows in the column
75  while (bool_reader->HasNext()) {
76  // Read BATCH_SIZE values at a time. The number of rows read is
77  // returned. values_read contains the number of non-null rows
78 
79  rows_read = bool_reader->ReadBatch(BATCH_SIZE, nullptr, nullptr,
80  value[k] + col_row_counts[col_id],
81  &values_read);
82 
83  // There are no NULL values in the rows written
84  col_row_counts[col_id] += rows_read;
85  }
86  }
87  }
88  const int kTotLabelNum = tot_label_num;
89  bool state[kTotLabelNum];
90  int count = 0;
91  int offset = chunk_idx * chunk_size;
92  for (int i = 0; i < row_num; i++) {
93  for (int j = 0; j < tested_label_num; j++) {
94  state[j] = value[j][i];
95  }
96  if (IsValid(state, tested_label_num)) {
97  count++;
98  if (query_type == QUERY_TYPE::INDEX)
99 
100  indices->push_back(i + offset);
101  else if (query_type == QUERY_TYPE::BITMAP)
102  SetBitmap(bitmap, i);
103  }
104  }
105 
106  // destroy the allocated space
107  for (int i = 0; i < num_columns; i++) {
108  delete[] value[i];
109  }
110  delete[] value;
111 
112  return count;
113 }