28 #include "graphar/result.h"
29 #include "graphar/status.h"
31 #include "arrow/api.h"
32 #include "arrow/csv/api.h"
33 #include "arrow/filesystem/api.h"
34 #include "arrow/io/api.h"
35 #include "arrow/stl.h"
36 #include "arrow/util/uri.h"
37 #include "parquet/arrow/reader.h"
38 #include "parquet/arrow/writer.h"
// Separator string literal "_" exposed as a preprocessor macro.
39 #define REGULAR_SEPARATOR "_"
54 Array() : data_(
nullptr), size_(0) {}
55 Array(
const T* data,
size_t size) : data_(data), size_(size) {}
58 Array& operator=(
const Array& other) =
default;
62 const T& operator[](
size_t index)
const {
return data_[index]; }
64 const T* data()
const {
return data_; }
66 size_t size()
const {
return size_; }
73 bool empty()
const {
return size_ == 0; }
75 void swap(
Array& other) {
76 std::swap(data_, other.data_);
77 std::swap(size_, other.size_);
80 const T* begin()
const {
return data_; }
82 const T* end()
const {
return data_ + size_; }
// Array specialization whose value type is std::string_view.
90 class Array<std::string_view> final {
// Element type produced when indexing or iterating.
92 using ValueType = std::string_view;
// Pointer to int32_t offsets; entries i and i + 1 bound element i in the byte buffer.
96 const int32_t* offsets_;
101 explicit iterator(
const int32_t* offsets,
const uint8_t* data,
size_t index)
102 : offsets_(offsets), data_(data), index_(index) {}
104 const std::string_view operator*()
const {
105 return std::string_view(
106 reinterpret_cast<const char*
>(data_ + offsets_[index_]),
107 offsets_[index_ + 1] - offsets_[index_]);
// Pre-increment operator; returns a reference to an iterator.
110 iterator& operator++() {
115 iterator operator++(
int) {
return iterator(offsets_, data_, index_++); }
117 iterator operator+(
size_t n) {
118 return iterator(offsets_, data_, index_ + n);
121 bool operator==(
const iterator& other)
const {
122 return index_ == other.index_;
124 bool operator!=(
const iterator& other)
const {
125 return index_ != other.index_;
128 Array() : offsets_(nullptr), data_(nullptr), size_(0) {}
129 explicit Array(
const int32_t* offsets,
const uint8_t* data,
size_t size)
130 : offsets_(offsets), data_(data), size_(size) {}
132 const std::string_view operator[](
size_t index)
const {
133 return std::string_view(
134 reinterpret_cast<const char*
>(data_ + offsets_[index]),
135 offsets_[index + 1] - offsets_[index]);
138 const int32_t* offsets()
const {
return offsets_; }
139 const uint8_t* data()
const {
return data_; }
141 size_t size()
const {
return size_; }
149 bool empty()
const {
return size_ == 0; }
151 void swap(Array& other) {
152 std::swap(offsets_, other.offsets_);
153 std::swap(data_, other.data_);
154 std::swap(size_, other.size_);
157 const iterator begin()
const {
return iterator(offsets_, data_, 0); }
158 const iterator end()
const {
return iterator(offsets_, data_, size_); }
// Offsets buffer: entries i and i + 1 bound element i inside data_.
161 const int32_t* offsets_;
// Byte buffer holding the string contents the offsets refer to.
162 const uint8_t* data_;
// Convenience aliases for Array instantiated with common element types.
166 using Int32Array = Array<int32_t>;
167 using Int64Array = Array<int64_t>;
168 using FloatArray = Array<float>;
169 using DoubleArray = Array<double>;
// Uses the std::string_view specialization (offsets + byte buffer).
170 using StringArray = Array<std::string_view>;
174 namespace graphar::util {
// Moves the supplied edge_chunk_nums into the edge_chunk_nums_ member.
178 : edge_chunk_nums_(std::move(edge_chunk_nums)) {}
179 IdType IndexPairToGlobalChunkIndex(IdType vertex_chunk_index,
180 IdType edge_chunk_index) {
181 IdType global_edge_chunk_index = 0;
182 for (IdType i = 0; i < vertex_chunk_index; ++i) {
183 global_edge_chunk_index += edge_chunk_nums_[i];
185 return global_edge_chunk_index + edge_chunk_index;
// Converts a global edge chunk index into a (vertex chunk index, edge chunk
// index) pair using the per-vertex-chunk counts in edge_chunk_nums_.
189 std::pair<IdType, IdType> GlobalChunkIndexToIndexPair(IdType global_index) {
// The result starts out as (0, 0).
190 std::pair<IdType, IdType> index_pair(0, 0);
191 for (
size_t i = 0; i < edge_chunk_nums_.size(); ++i) {
// The remaining index falls inside vertex chunk i: record i and the remainder.
192 if (global_index < edge_chunk_nums_[i]) {
193 index_pair.first =
static_cast<IdType
>(i);
194 index_pair.second = global_index;
// Subtract this vertex chunk's edge chunk count before examining the next one.
197 global_index -= edge_chunk_nums_[i];
// Edge chunk counts, indexed by vertex chunk index.
203 std::vector<IdType> edge_chunk_nums_;
206 static inline IdType IndexPairToGlobalChunkIndex(
207 const std::vector<IdType>& edge_chunk_nums, IdType vertex_chunk_index,
208 IdType edge_chunk_index) {
209 IdType global_edge_chunk_index = 0;
210 for (IdType i = 0; i < vertex_chunk_index; ++i) {
211 global_edge_chunk_index += edge_chunk_nums[i];
213 return global_edge_chunk_index + edge_chunk_index;
// Converts a global edge chunk index into a (vertex chunk index, edge chunk
// index) pair using the per-vertex-chunk counts passed in edge_chunk_nums.
217 static inline std::pair<IdType, IdType> GlobalChunkIndexToIndexPair(
218 const std::vector<IdType>& edge_chunk_nums, IdType global_index) {
// The result starts out as (0, 0).
219 std::pair<IdType, IdType> index_pair(0, 0);
220 for (
size_t i = 0; i < edge_chunk_nums.size(); ++i) {
// The remaining index falls inside vertex chunk i: record i and the remainder.
221 if (global_index < edge_chunk_nums[i]) {
222 index_pair.first =
static_cast<IdType
>(i);
223 index_pair.second = global_index;
// Subtract this vertex chunk's edge chunk count before examining the next one.
226 global_index -= edge_chunk_nums[i];
231 std::shared_ptr<arrow::ChunkedArray> GetArrowColumnByName(
232 std::shared_ptr<arrow::Table>
const& table,
const std::string& name);
234 std::shared_ptr<arrow::Array> GetArrowArrayByChunkIndex(
235 std::shared_ptr<arrow::ChunkedArray>
const& chunk_array,
236 int64_t chunk_index);
238 Result<const void*> GetArrowArrayData(
239 std::shared_ptr<arrow::Array>
const& array);
/**
 * Joins the strings in `str_vec`, inserting `delimiter` between neighbours.
 *
 * A delimiter is written between every pair of adjacent elements, including
 * empty ones, so {"", "a"} joined with "_" yields "_a" rather than "a".
 * An empty vector yields an empty string.
 *
 * The output buffer is sized once up front and appended to in place, so the
 * accumulated string is not recopied for every element.
 *
 * @param str_vec   Strings to join, in order.
 * @param delimiter Text placed between consecutive elements.
 * @return The joined string.
 */
static inline std::string ConcatStringWithDelimiter(
    const std::vector<std::string>& str_vec, const std::string& delimiter) {
  std::string joined;
  if (str_vec.empty()) {
    return joined;
  }

  // Exact final length: all pieces plus one delimiter per gap.
  size_t total_length = delimiter.size() * (str_vec.size() - 1);
  for (const auto& piece : str_vec) {
    total_length += piece.size();
  }
  joined.reserve(total_length);

  bool is_first = true;
  for (const auto& piece : str_vec) {
    // Position-based check: an empty accumulated string must not suppress
    // the delimiter after a leading empty element.
    if (!is_first) {
      joined += delimiter;
    }
    joined += piece;
    is_first = false;
  }
  return joined;
}
// Generic accessor: reinterprets `data` as an array of const T and returns
// the element at position `offset` by value. No range check is made.
250 template <
typename T>
252 inline static T Value(
const void* data, int64_t offset) {
253 return reinterpret_cast<const T*
>(data)[offset];
// Declaration only: std::string-returning Value taking the same
// (data, offset) arguments; its definition is not part of this section.
259 static std::string Value(
const void* data, int64_t offset);
// Opens the file at `file_path` for reading and stores a
// parquet::arrow::FileReader for it in `*parquet_reader`, using `pool`.
// Failures are returned through the ARROW_* macros; OK is returned otherwise.
262 static inline arrow::Status OpenParquetArrowReader(
263 const std::string& file_path, arrow::MemoryPool* pool,
264 std::unique_ptr<parquet::arrow::FileReader>* parquet_reader) {
265 std::shared_ptr<arrow::io::RandomAccessFile> input;
266 ARROW_ASSIGN_OR_RAISE(input, arrow::io::ReadableFile::Open(file_path));
// When ARROW_VERSION is defined and <= 20000000, use the OpenFile overload
// that fills the reader through an out-parameter.
267 #if defined(ARROW_VERSION) && ARROW_VERSION <= 20000000
268 ARROW_RETURN_NOT_OK(parquet::arrow::OpenFile(input, pool, parquet_reader));
// Alternative call form: OpenFile yields the reader as a result value, which
// is then moved into the out-parameter.
270 ARROW_ASSIGN_OR_RAISE(
auto reader, parquet::arrow::OpenFile(input, pool));
271 *parquet_reader = std::move(reader);
273 return arrow::Status::OK();