Apache GraphAr C++ Library
The C++ Library for Apache GraphAr
util.h
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements. See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership. The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License. You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied. See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 #pragma once
21 
#include <cstddef>
#include <cstdint>
#include <iterator>
22 #include <memory>
23 #include <numeric>
24 #include <string>
#include <string_view>
25 #include <utility>
26 #include <vector>
27 
28 #include "graphar/result.h"
29 #include "graphar/status.h"
30 
31 #include "arrow/api.h"
32 #include "arrow/csv/api.h"
33 #include "arrow/filesystem/api.h"
34 #include "arrow/io/api.h"
35 #include "arrow/stl.h"
36 #include "arrow/util/uri.h"
37 #include "parquet/arrow/reader.h"
38 #include "parquet/arrow/writer.h"
// Separator string literal "_". It is not referenced anywhere else in the
// visible part of this header; users are presumably in other files.
39 #define REGULAR_SEPARATOR "_"
40 
41 // forward declarations
// Table, ChunkedArray and Array are the Arrow types that appear in the
// function signatures declared further down in this header.
42 namespace arrow {
43 class Table;
44 class ChunkedArray;
45 class Array;
46 } // namespace arrow
47 
48 namespace graphar {
49 
// Read-only view over `size` consecutive elements of type T starting at
// `data`. Only the pointer and the length are stored: nothing is copied or
// freed by this class, and element access is not bounds-checked.
template <typename T>
class Array final {
 public:
  using ValueType = T;

  // An empty view: null pointer, zero length.
  Array() : Array(nullptr, 0) {}

  // View over the range [data, data + size).
  Array(const T* data, size_t size) : data_(data), size_(size) {}

  // Copy/move construction, copy/move assignment and destruction are the
  // implicitly generated member-wise versions, which just copy the pointer
  // and the length.

  // Unchecked element access; `index` must be smaller than size().
  const T& operator[](size_t index) const { return *(data_ + index); }

  // Pointer to the first element (nullptr for a default-constructed view).
  const T* data() const { return data_; }

  // Number of elements in the view.
  size_t size() const { return size_; }

  // Reset to the empty state without touching the viewed memory.
  void clear() { *this = Array(); }

  // True when the view has no elements.
  bool empty() const { return size() == 0; }

  // Exchange pointer and length with `other`.
  void swap(Array& other) {
    const Array previous = *this;
    *this = other;
    other = previous;
  }

  // Pointer-based iteration, usable with range-for.
  const T* begin() const { return data(); }

  const T* end() const { return begin() + size(); }

 private:
  const T* data_;
  size_t size_;
};
88 
89 template <>
90 class Array<std::string_view> final {
91  public:
92  using ValueType = std::string_view;
93 
94  class iterator {
95  private:
96  const int32_t* offsets_;
97  const uint8_t* data_;
98  size_t index_;
99 
100  public:
101  explicit iterator(const int32_t* offsets, const uint8_t* data, size_t index)
102  : offsets_(offsets), data_(data), index_(index) {}
103 
104  const std::string_view operator*() const {
105  return std::string_view(
106  reinterpret_cast<const char*>(data_ + offsets_[index_]),
107  offsets_[index_ + 1] - offsets_[index_]);
108  }
109 
110  iterator& operator++() {
111  ++index_;
112  return *this;
113  }
114 
115  iterator operator++(int) { return iterator(offsets_, data_, index_++); }
116 
117  iterator operator+(size_t n) {
118  return iterator(offsets_, data_, index_ + n);
119  }
120 
121  bool operator==(const iterator& other) const {
122  return index_ == other.index_;
123  }
124  bool operator!=(const iterator& other) const {
125  return index_ != other.index_;
126  }
127  };
128  Array() : offsets_(nullptr), data_(nullptr), size_(0) {}
129  explicit Array(const int32_t* offsets, const uint8_t* data, size_t size)
130  : offsets_(offsets), data_(data), size_(size) {}
131 
132  const std::string_view operator[](size_t index) const {
133  return std::string_view(
134  reinterpret_cast<const char*>(data_ + offsets_[index]),
135  offsets_[index + 1] - offsets_[index]);
136  }
137 
138  const int32_t* offsets() const { return offsets_; }
139  const uint8_t* data() const { return data_; }
140 
141  size_t size() const { return size_; }
142 
143  void clear() {
144  offsets_ = nullptr;
145  data_ = nullptr;
146  size_ = 0;
147  }
148 
149  bool empty() const { return size_ == 0; }
150 
151  void swap(Array& other) {
152  std::swap(offsets_, other.offsets_);
153  std::swap(data_, other.data_);
154  std::swap(size_, other.size_);
155  }
156 
157  const iterator begin() const { return iterator(offsets_, data_, 0); }
158  const iterator end() const { return iterator(offsets_, data_, size_); }
159 
160  private:
161  const int32_t* offsets_;
162  const uint8_t* data_;
163  size_t size_;
164 };
165 
// Convenience aliases for Array instantiated with fixed-width numeric
// element types and with std::string_view.
166 using Int32Array = Array<int32_t>;
167 using Int64Array = Array<int64_t>;
168 using FloatArray = Array<float>;
169 using DoubleArray = Array<double>;
170 using StringArray = Array<std::string_view>;
171 
172 } // namespace graphar
173 
174 namespace graphar::util {
175 
// Takes over `edge_chunk_nums` by moving it into the converter. The other
// members index this vector by vertex chunk index and treat each entry as
// the number of edge chunks belonging to that vertex chunk.
177  explicit IndexConverter(std::vector<IdType>&& edge_chunk_nums)
178  : edge_chunk_nums_(std::move(edge_chunk_nums)) {}
179  IdType IndexPairToGlobalChunkIndex(IdType vertex_chunk_index,
180  IdType edge_chunk_index) {
181  IdType global_edge_chunk_index = 0;
182  for (IdType i = 0; i < vertex_chunk_index; ++i) {
183  global_edge_chunk_index += edge_chunk_nums_[i];
184  }
185  return global_edge_chunk_index + edge_chunk_index;
186  }
187 
188  // covert edge global chunk index to <vertex_chunk_index, edge_chunk_index>
189  std::pair<IdType, IdType> GlobalChunkIndexToIndexPair(IdType global_index) {
190  std::pair<IdType, IdType> index_pair(0, 0);
191  for (size_t i = 0; i < edge_chunk_nums_.size(); ++i) {
192  if (global_index < edge_chunk_nums_[i]) {
193  index_pair.first = static_cast<IdType>(i);
194  index_pair.second = global_index;
195  break;
196  }
197  global_index -= edge_chunk_nums_[i];
198  }
199  return index_pair;
200  }
201 
202  private:
// Per-vertex-chunk edge chunk counts: entry i is added once for every vertex
// chunk index greater than i when computing a global edge chunk index.
203  std::vector<IdType> edge_chunk_nums_;
204 };
205 
206 static inline IdType IndexPairToGlobalChunkIndex(
207  const std::vector<IdType>& edge_chunk_nums, IdType vertex_chunk_index,
208  IdType edge_chunk_index) {
209  IdType global_edge_chunk_index = 0;
210  for (IdType i = 0; i < vertex_chunk_index; ++i) {
211  global_edge_chunk_index += edge_chunk_nums[i];
212  }
213  return global_edge_chunk_index + edge_chunk_index;
214 }
215 
216 // covert edge global chunk index to <vertex_chunk_index, edge_chunk_index>
217 static inline std::pair<IdType, IdType> GlobalChunkIndexToIndexPair(
218  const std::vector<IdType>& edge_chunk_nums, IdType global_index) {
219  std::pair<IdType, IdType> index_pair(0, 0);
220  for (size_t i = 0; i < edge_chunk_nums.size(); ++i) {
221  if (global_index < edge_chunk_nums[i]) {
222  index_pair.first = static_cast<IdType>(i);
223  index_pair.second = global_index;
224  break;
225  }
226  global_index -= edge_chunk_nums[i];
227  }
228  return index_pair;
229 }
230 
// Declaration only; the definition is not in this header.
// Presumably returns the column of `table` whose field name equals `name` -
// behavior for a missing column is not visible here (TODO confirm).
231 std::shared_ptr<arrow::ChunkedArray> GetArrowColumnByName(
232  std::shared_ptr<arrow::Table> const& table, const std::string& name);
233 
// Declaration only; the definition is not in this header.
// Presumably returns chunk number `chunk_index` of `chunk_array` - handling
// of an out-of-range index is not visible here (TODO confirm).
234 std::shared_ptr<arrow::Array> GetArrowArrayByChunkIndex(
235  std::shared_ptr<arrow::ChunkedArray> const& chunk_array,
236  int64_t chunk_index);
237 
// Declaration only; the definition is not in this header.
// Yields an untyped pointer for `array` wrapped in a Result, so failure is
// reported through the Result rather than a null pointer. Which array types
// are supported is not visible here (TODO confirm).
238 Result<const void*> GetArrowArrayData(
239  std::shared_ptr<arrow::Array> const& array);
240 
// Join the strings of `str_vec`, placing `delimiter` between every pair of
// neighbouring elements.
//
// An empty vector yields an empty string and a single element is returned
// unchanged. Empty elements are kept like any other element, so
// {"", "a"} joined with "_" gives "_a".
//
// @param str_vec    strings to join, in order
// @param delimiter  text inserted between consecutive elements
// @return the joined string
static inline std::string ConcatStringWithDelimiter(
    const std::vector<std::string>& str_vec, const std::string& delimiter) {
  if (str_vec.empty()) {
    return std::string();
  }

  // Compute the final length up front so the result is allocated once.
  std::string::size_type total_length = delimiter.size() * (str_vec.size() - 1);
  for (const std::string& part : str_vec) {
    total_length += part.size();
  }

  std::string joined;
  joined.reserve(total_length);
  auto part = str_vec.begin();
  joined += *part;
  for (++part; part != str_vec.end(); ++part) {
    joined += delimiter;
    joined += *part;
  }
  return joined;
}
249 
// Reads one element of type T out of an untyped buffer.
template <typename T>
struct ValueGetter {
  // Interprets `data` as an array of T and returns a copy of the element at
  // position `offset` (counted in elements, not bytes). No bounds check.
  static T Value(const void* data, int64_t offset) {
    const T* typed = reinterpret_cast<const T*>(data);
    return *(typed + offset);
  }
};
256 
// Specialization for std::string. Value is only declared here; its
// definition, and therefore the buffer layout it expects behind `data`, is
// not visible in this header.
257 template <>
258 struct ValueGetter<std::string> {
259  static std::string Value(const void* data, int64_t offset);
260 };
261 
// Open the file at `file_path` with arrow::io::ReadableFile::Open and create
// a parquet::arrow::FileReader on top of it, allocating from `pool`.
//
// On success the reader is stored in `*parquet_reader` and an OK status is
// returned; a failure of either step is propagated as the returned status.
// `parquet_reader` must not be null: it is written through without a check.
262 static inline arrow::Status OpenParquetArrowReader(
263  const std::string& file_path, arrow::MemoryPool* pool,
264  std::unique_ptr<parquet::arrow::FileReader>* parquet_reader) {
265  std::shared_ptr<arrow::io::RandomAccessFile> input;
266  ARROW_ASSIGN_OR_RAISE(input, arrow::io::ReadableFile::Open(file_path));
// Two call shapes of parquet::arrow::OpenFile are supported, selected by
// ARROW_VERSION (presumably 20000000 encodes Arrow 20.0.0 - TODO confirm):
// this branch passes the out-pointer and checks the returned status.
267 #if defined(ARROW_VERSION) && ARROW_VERSION <= 20000000
268  ARROW_RETURN_NOT_OK(parquet::arrow::OpenFile(input, pool, parquet_reader));
269 #else
// This branch receives the reader as the call's result and moves it into the
// caller's pointer.
270  ARROW_ASSIGN_OR_RAISE(auto reader, parquet::arrow::OpenFile(input, pool));
271  *parquet_reader = std::move(reader);
272 #endif
273  return arrow::Status::OK();
274 }
275 
276 } // namespace graphar::util