std::shared_ptr<DataType> list(const std::shared_ptr<DataType>& value_type) {
  return std::make_shared<ListType>(value_type);
}
arrow/cpp/src/arrow/type.h
/// \brief Concrete type class for list data
///
/// List data is nested data where each value is a variable number of
/// child items. Lists can be recursively nested, for example
/// list(list(int32)).
class ARROW_EXPORT ListType : public BaseListType {
 public:
  static constexpr Type::type type_id = Type::LIST;

  using offset_type = int32_t;

  static constexpr const char* type_name() { return "list"; }

  // List can contain any other logical value type
  explicit ListType(const std::shared_ptr<DataType>& value_type)
      : ListType(std::make_shared<Field>("item", value_type)) {}

  explicit ListType(const std::shared_ptr<Field>& value_field) : BaseListType(type_id) {
    children_ = {value_field};
  }

  DataTypeLayout layout() const override {
    return DataTypeLayout(
        {DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(sizeof(offset_type))});
  }

  std::string ToString() const override;

  std::string name() const override { return "list"; }

 protected:
  std::string ComputeFingerprint() const override;
};
static constexpr Type::type type_id = Type::LIST;
/// \brief Main data type enumeration
///
/// This enumeration provides a quick way to interrogate the category
/// of a DataType instance.
enum type {
  // ...
  /// A list of some logical data type
  LIST,
  // ...
};
arrow/cpp/src/arrow/type.cc
list<item: list<item: int8>>
The recursion is implemented here: ToString() prints the child field, and that field's type may itself be another ListType.
std::string ListType::ToString() const {
  std::stringstream s;
  s << "list<" << value_field()->ToString() << ">";
  return s.str();
}
arrow/cpp/src/arrow/type.h
/// \brief Base class for all variable-size list data types
class ARROW_EXPORT BaseListType : public NestedType {
 public:
  using NestedType::NestedType;

  std::shared_ptr<Field> value_field() const { return children_[0]; }

  std::shared_ptr<DataType> value_type() const { return children_[0]->type(); }
};
arrow/cpp/src/arrow/type.h
The child field's default name is "item":
// List can contain any other logical value type
explicit ListType(const std::shared_ptr<DataType>& value_type)
    : ListType(std::make_shared<Field>("item", value_type)) {}
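As a small illustration of that default (a sketch, assuming the arrow::list and arrow::int32 factory functions are in scope):

auto t = arrow::list(arrow::int32());
// The single child field created by the constructor above is named "item":
//   t->field(0)->name()  -> "item"
//   t->ToString()        -> "list<item: int32>"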
arrow/cpp/src/arrow/array/builder_nested.h
// ----------------------------------------------------------------------
// List builder

template <typename TYPE>
class BaseListBuilder : public ArrayBuilder {
 public:
  using TypeClass = TYPE;
  using offset_type = typename TypeClass::offset_type;

  /// Use this constructor to incrementally build the value array along with offsets and
  /// null bitmap.
  BaseListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> const& value_builder,
                  const std::shared_ptr<DataType>& type)
      : ArrayBuilder(pool),
        offsets_builder_(pool),
        value_builder_(value_builder),
        value_field_(type->field(0)->WithType(NULLPTR)) {}

  BaseListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> const& value_builder)
      : BaseListBuilder(pool, value_builder, list(value_builder->type())) {}

  Status Resize(int64_t capacity) override {
    if (capacity > maximum_elements()) {
      return Status::CapacityError("List array cannot reserve space for more than ",
                                   maximum_elements(), " got ", capacity);
    }
    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));

    // One more than requested for offsets
    ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
    return ArrayBuilder::Resize(capacity);
  }

  void Reset() override {
    ArrayBuilder::Reset();
    offsets_builder_.Reset();
    value_builder_->Reset();
  }

  /// \brief Vector append
  ///
  /// If passed, valid_bytes is of equal length to values, and any zero byte
  /// will be considered as a null for that slot
  Status AppendValues(const offset_type* offsets, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR) {
    ARROW_RETURN_NOT_OK(Reserve(length));
    UnsafeAppendToBitmap(valid_bytes, length);
    offsets_builder_.UnsafeAppend(offsets, length);
    return Status::OK();
  }

  /// \brief Start a new variable-length list slot
  ///
  /// This function should be called before beginning to append elements to the
  /// value builder
  Status Append(bool is_valid = true) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppendToBitmap(is_valid);
    return AppendNextOffset();
  }

  Status AppendNull() final { return Append(false); }

  Status AppendNulls(int64_t length) final {
    ARROW_RETURN_NOT_OK(Reserve(length));
    ARROW_RETURN_NOT_OK(ValidateOverflow(0));
    UnsafeAppendToBitmap(length, false);
    const int64_t num_values = value_builder_->length();
    for (int64_t i = 0; i < length; ++i) {
      offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_values));
    }
    return Status::OK();
  }

  Status AppendEmptyValue() final { return Append(true); }

  Status AppendEmptyValues(int64_t length) final {
    ARROW_RETURN_NOT_OK(Reserve(length));
    ARROW_RETURN_NOT_OK(ValidateOverflow(0));
    UnsafeAppendToBitmap(length, true);
    const int64_t num_values = value_builder_->length();
    for (int64_t i = 0; i < length; ++i) {
      offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_values));
    }
    return Status::OK();
  }

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
    ARROW_RETURN_NOT_OK(AppendNextOffset());

    // Offset padding zeroed by BufferBuilder
    std::shared_ptr<Buffer> offsets, null_bitmap;
    ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
    ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));

    if (value_builder_->length() == 0) {
      // Try to make sure we get a non-null values buffer (ARROW-2744)
      ARROW_RETURN_NOT_OK(value_builder_->Resize(0));
    }

    std::shared_ptr<ArrayData> items;
    ARROW_RETURN_NOT_OK(value_builder_->FinishInternal(&items));

    *out = ArrayData::Make(type(), length_, {null_bitmap, offsets}, {std::move(items)},
                           null_count_);
    Reset();
    return Status::OK();
  }

  Status ValidateOverflow(int64_t new_elements) const {
    auto new_length = value_builder_->length() + new_elements;
    if (ARROW_PREDICT_FALSE(new_length > maximum_elements())) {
      return Status::CapacityError("List array cannot contain more than ",
                                   maximum_elements(), " elements, have ", new_elements);
    } else {
      return Status::OK();
    }
  }

  ArrayBuilder* value_builder() const { return value_builder_.get(); }

  // Cannot make this a static attribute because of linking issues
  static constexpr int64_t maximum_elements() {
    return std::numeric_limits<offset_type>::max() - 1;
  }

  std::shared_ptr<DataType> type() const override {
    return std::make_shared<TYPE>(value_field_->WithType(value_builder_->type()));
  }

 protected:
  TypedBufferBuilder<offset_type> offsets_builder_;
  std::shared_ptr<ArrayBuilder> value_builder_;
  std::shared_ptr<Field> value_field_;

  Status AppendNextOffset() {
    ARROW_RETURN_NOT_OK(ValidateOverflow(0));
    const int64_t num_values = value_builder_->length();
    return offsets_builder_.Append(static_cast<offset_type>(num_values));
  }
};
class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder {
 public:
  using TypeClass = Decimal128Type;

  explicit Decimal128Builder(const std::shared_ptr<DataType>& type,
                             MemoryPool* pool = default_memory_pool());

  using FixedSizeBinaryBuilder::Append;
  using FixedSizeBinaryBuilder::AppendValues;
  using FixedSizeBinaryBuilder::Reset;

  Status Append(Decimal128 val);
  void UnsafeAppend(Decimal128 val);
  void UnsafeAppend(util::string_view val);

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<Decimal128Array>* out) { return FinishTyped(out); }

  std::shared_ptr<DataType> type() const override { return decimal_type_; }

 protected:
  std::shared_ptr<Decimal128Type> decimal_type_;
};
class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
 public:
  using TypeClass = FixedSizeBinaryType;

  explicit FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
                                  MemoryPool* pool = default_memory_pool());

  Status Append(const uint8_t* value) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(value);
    return Status::OK();
  }

  Status Append(const char* value) {
    return Append(reinterpret_cast<const uint8_t*>(value));
  }

  Status Append(const util::string_view& view) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(view);
    return Status::OK();
  }

  Status Append(const std::string& s) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(s);
    return Status::OK();
  }

  template <size_t NBYTES>
  Status Append(const std::array<uint8_t, NBYTES>& value) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(
        util::string_view(reinterpret_cast<const char*>(value.data()), value.size()));
    return Status::OK();
  }

  Status AppendValues(const uint8_t* data, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR);

  Status AppendNull() final;
  Status AppendNulls(int64_t length) final;

  Status AppendEmptyValue() final;
  Status AppendEmptyValues(int64_t length) final;

  void UnsafeAppend(const uint8_t* value) {
    UnsafeAppendToBitmap(true);
    if (ARROW_PREDICT_TRUE(byte_width_ > 0)) {
      byte_builder_.UnsafeAppend(value, byte_width_);
    }
  }

  void UnsafeAppend(const char* value) {
    UnsafeAppend(reinterpret_cast<const uint8_t*>(value));
  }

  void UnsafeAppend(util::string_view value) {
    // ...
// ---------------------------------------------------------------------- // Binary and String template <typename TYPE> class BaseBinaryBuilder : public ArrayBuilder { public: using TypeClass = TYPE; using offset_type = typename TypeClass::offset_type; explicit BaseBinaryBuilder(MemoryPool* pool = default_memory_pool()) : ArrayBuilder(pool), offsets_builder_(pool), value_data_builder_(pool) {} BaseBinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool) : BaseBinaryBuilder(pool) {} Status Append(const uint8_t* value, offset_type length) { ARROW_RETURN_NOT_OK(Reserve(1)); ARROW_RETURN_NOT_OK(AppendNextOffset()); // Safety check for UBSAN. if (ARROW_PREDICT_TRUE(length > 0)) { ARROW_RETURN_NOT_OK(ValidateOverflow(length)); ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length)); } UnsafeAppendToBitmap(true); return Status::OK(); } Status Append(const char* value, offset_type length) { return Append(reinterpret_cast<const uint8_t*>(value), length); } Status Append(util::string_view value) { return Append(value.data(), static_cast<offset_type>(value.size())); } Status AppendNulls(int64_t length) final { const int64_t num_bytes = value_data_builder_.length(); ARROW_RETURN_NOT_OK(Reserve(length)); for (int64_t i = 0; i < length; ++i) { offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes)); } UnsafeAppendToBitmap(length, false); return Status::OK(); } Status AppendNull() final { ARROW_RETURN_NOT_OK(AppendNextOffset()); ARROW_RETURN_NOT_OK(Reserve(1)); UnsafeAppendToBitmap(false); return Status::OK(); } Status AppendEmptyValue() final { ARROW_RETURN_NOT_OK(AppendNextOffset()); ARROW_RETURN_NOT_OK(Reserve(1)); UnsafeAppendToBitmap(true); return Status::OK(); } Status AppendEmptyValues(int64_t length) final { const int64_t num_bytes = value_data_builder_.length(); ARROW_RETURN_NOT_OK(Reserve(length)); for (int64_t i = 0; i < length; ++i) { offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes)); } UnsafeAppendToBitmap(length, true); return Status::OK(); } /// \brief Append without checking capacity /// /// Offsets and data should have been presized using Reserve() and /// ReserveData(), respectively. void UnsafeAppend(const uint8_t* value, offset_type length) { UnsafeAppendNextOffset(); value_data_builder_.UnsafeAppend(value, length); UnsafeAppendToBitmap(true); } void UnsafeAppend(const char* value, offset_type length) { UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length); } void UnsafeAppend(const std::string& value) { UnsafeAppend(value.c_str(), static_cast<offset_type>(value.size())); } void UnsafeAppend(util::string_view value) { UnsafeAppend(value.data(), static_cast<offset_type>(value.size())); } void UnsafeAppendNull() { const int64_t num_bytes = value_data_builder_.length(); offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes)); UnsafeAppendToBitmap(false); } /// \brief Append a sequence of strings in one shot. 
/// /// \param[in] values a vector of strings /// \param[in] valid_bytes an optional sequence of bytes where non-zero /// indicates a valid (non-null) value /// \return Status Status AppendValues(const std::vector<std::string>& values, const uint8_t* valid_bytes = NULLPTR) { std::size_t total_length = std::accumulate( values.begin(), values.end(), 0ULL, [](uint64_t sum, const std::string& str) { return sum + str.size(); }); ARROW_RETURN_NOT_OK(Reserve(values.size())); ARROW_RETURN_NOT_OK(value_data_builder_.Reserve(total_length)); ARROW_RETURN_NOT_OK(offsets_builder_.Reserve(values.size())); if (valid_bytes != NULLPTR) { for (std::size_t i = 0; i < values.size(); ++i) { UnsafeAppendNextOffset(); if (valid_bytes[i]) { value_data_builder_.UnsafeAppend( reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size()); } } } else { for (std::size_t i = 0; i < values.size(); ++i) { UnsafeAppendNextOffset(); value_data_builder_.UnsafeAppend( reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size()); } } UnsafeAppendToBitmap(valid_bytes, values.size()); return Status::OK(); } /// \brief Append a sequence of nul-terminated strings in one shot. /// If one of the values is NULL, it is processed as a null /// value even if the corresponding valid_bytes entry is 1. /// /// \param[in] values a contiguous C array of nul-terminated char * /// \param[in] length the number of values to append /// \param[in] valid_bytes an optional sequence of bytes where non-zero /// indicates a valid (non-null) value /// \return Status Status AppendValues(const char** values, int64_t length, const uint8_t* valid_bytes = NULLPTR) { std::size_t total_length = 0; std::vector<std::size_t> value_lengths(length); bool have_null_value = false; for (int64_t i = 0; i < length; ++i) { if (values[i] != NULLPTR) { auto value_length = strlen(values[i]); value_lengths[i] = value_length; total_length += value_length; } else { have_null_value = true; } } ARROW_RETURN_NOT_OK(Reserve(length)); ARROW_RETURN_NOT_OK(ReserveData(total_length)); if (valid_bytes) { int64_t valid_bytes_offset = 0; for (int64_t i = 0; i < length; ++i) { UnsafeAppendNextOffset(); if (valid_bytes[i]) { if (values[i]) { value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]), value_lengths[i]); } else { UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, i - valid_bytes_offset); UnsafeAppendToBitmap(false); valid_bytes_offset = i + 1; } } } UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset); } else { if (have_null_value) { std::vector<uint8_t> valid_vector(length, 0); for (int64_t i = 0; i < length; ++i) { UnsafeAppendNextOffset(); if (values[i]) { value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]), value_lengths[i]); valid_vector[i] = 1; } } UnsafeAppendToBitmap(valid_vector.data(), length); } else { for (int64_t i = 0; i < length; ++i) { UnsafeAppendNextOffset(); value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]), value_lengths[i]); } UnsafeAppendToBitmap(NULLPTR, length); } } return Status::OK(); } void Reset() override { ArrayBuilder::Reset(); offsets_builder_.Reset(); value_data_builder_.Reset(); } Status ValidateOverflow(int64_t new_bytes) { auto new_size = value_data_builder_.length() + new_bytes; if (ARROW_PREDICT_FALSE(new_size > memory_limit())) { return Status::CapacityError("array cannot contain more than ", memory_limit(), " bytes, have ", new_size); } else { return Status::OK(); } } Status Resize(int64_t capacity) override { // 
XXX Why is this check necessary? There is no reason to disallow, say, // binary arrays with more than 2**31 empty or null values. if (capacity > memory_limit()) { return Status::CapacityError("BinaryBuilder cannot reserve space for more than ", memory_limit(), " child elements, got ", capacity); } ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); // One more than requested for offsets ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1)); return ArrayBuilder::Resize(capacity); } /// \brief Ensures there is enough allocated capacity to append the indicated /// number of bytes to the value data buffer without additional allocations Status ReserveData(int64_t elements) { ARROW_RETURN_NOT_OK(ValidateOverflow(elements)); return value_data_builder_.Reserve(elements); } Status FinishInternal(std::shared_ptr<ArrayData>* out) override { // Write final offset (values length) ARROW_RETURN_NOT_OK(AppendNextOffset()); // These buffers' padding zeroed by BufferBuilder std::shared_ptr<Buffer> offsets, value_data, null_bitmap; ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); ARROW_RETURN_NOT_OK(value_data_builder_.Finish(&value_data)); ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); *out = ArrayData::Make(type(), length_, {null_bitmap, offsets, value_data}, null_count_, 0); Reset(); return Status::OK(); } /// \return data pointer of the value date builder const uint8_t* value_data() const { return value_data_builder_.data(); } /// \return size of values buffer so far int64_t value_data_length() const { return value_data_builder_.length(); } /// \return capacity of values buffer int64_t value_data_capacity() const { return value_data_builder_.capacity(); } /// \return data pointer of the value date builder const offset_type* offsets_data() const { return offsets_builder_.data(); } /// Temporary access to a value. /// /// This pointer becomes invalid on the next modifying operation. const uint8_t* GetValue(int64_t i, offset_type* out_length) const { const offset_type* offsets = offsets_builder_.data(); const auto offset = offsets[i]; if (i == (length_ - 1)) { *out_length = static_cast<offset_type>(value_data_builder_.length()) - offset; } else { *out_length = offsets[i + 1] - offset; } return value_data_builder_.data() + offset; } offset_type offset(int64_t i) const { return offsets_data()[i]; } /// Temporary access to a value. /// /// This view becomes invalid on the next modifying operation. util::string_view GetView(int64_t i) const { offset_type value_length; const uint8_t* value_data = GetValue(i, &value_length); return util::string_view(reinterpret_cast<const char*>(value_data), value_length); } // Cannot make this a static attribute because of linking issues static constexpr int64_t memory_limit() { return std::numeric_limits<offset_type>::max() - 1; } protected: TypedBufferBuilder<offset_type> offsets_builder_; TypedBufferBuilder<uint8_t> value_data_builder_; Status AppendNextOffset() { const int64_t num_bytes = value_data_builder_.length(); return offsets_builder_.Append(static_cast<offset_type>(num_bytes)); } void UnsafeAppendNextOffset() { const int64_t num_bytes = value_data_builder_.length(); offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes)); } };
/// Base class for all Builders that emit an Array of a scalar numerical type. template <typename T> class NumericBuilder : public ArrayBuilder { public: using TypeClass = T; using value_type = typename T::c_type; using ArrayType = typename TypeTraits<T>::ArrayType; template <typename T1 = T> explicit NumericBuilder( enable_if_parameter_free<T1, MemoryPool*> pool = default_memory_pool()) : ArrayBuilder(pool), type_(TypeTraits<T>::type_singleton()), data_builder_(pool) {} NumericBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool) : ArrayBuilder(pool), type_(type), data_builder_(pool) {} /// Append a single scalar and increase the size if necessary. Status Append(const value_type val) { ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1)); UnsafeAppend(val); return Status::OK(); } /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory /// The memory at the corresponding data slot is set to 0 to prevent /// uninitialized memory access Status AppendNulls(int64_t length) final { ARROW_RETURN_NOT_OK(Reserve(length)); data_builder_.UnsafeAppend(length, value_type{}); // zero UnsafeSetNull(length); return Status::OK(); } /// \brief Append a single null element Status AppendNull() final { ARROW_RETURN_NOT_OK(Reserve(1)); data_builder_.UnsafeAppend(value_type{}); // zero UnsafeAppendToBitmap(false); return Status::OK(); } /// \brief Append a empty element Status AppendEmptyValue() final { ARROW_RETURN_NOT_OK(Reserve(1)); data_builder_.UnsafeAppend(value_type{}); // zero UnsafeAppendToBitmap(true); return Status::OK(); } /// \brief Append several empty elements Status AppendEmptyValues(int64_t length) final { ARROW_RETURN_NOT_OK(Reserve(length)); data_builder_.UnsafeAppend(length, value_type{}); // zero UnsafeSetNotNull(length); return Status::OK(); } value_type GetValue(int64_t index) const { return data_builder_.data()[index]; } void Reset() override { data_builder_.Reset(); } Status Resize(int64_t capacity) override { ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); capacity = std::max(capacity, kMinBuilderCapacity); ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity)); return ArrayBuilder::Resize(capacity); } value_type operator[](int64_t index) const { return GetValue(index); } value_type& operator[](int64_t index) { return reinterpret_cast<value_type*>(data_builder_.mutable_data())[index]; } /// \brief Append a sequence of elements in one shot /// \param[in] values a contiguous C array of values /// \param[in] length the number of values to append /// \param[in] valid_bytes an optional sequence of bytes where non-zero /// indicates a valid (non-null) value /// \return Status Status AppendValues(const value_type* values, int64_t length, const uint8_t* valid_bytes = NULLPTR) { ARROW_RETURN_NOT_OK(Reserve(length)); data_builder_.UnsafeAppend(values, length); // length_ is update by these ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length); return Status::OK(); } /// \brief Append a sequence of elements in one shot /// \param[in] values a contiguous C array of values /// \param[in] length the number of values to append /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null /// (0). 
Equal in length to values /// \return Status Status AppendValues(const value_type* values, int64_t length, const std::vector<bool>& is_valid) { ARROW_RETURN_NOT_OK(Reserve(length)); data_builder_.UnsafeAppend(values, length); // length_ is update by these ArrayBuilder::UnsafeAppendToBitmap(is_valid); return Status::OK(); } /// \brief Append a sequence of elements in one shot /// \param[in] values a std::vector of values /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null /// (0). Equal in length to values /// \return Status Status AppendValues(const std::vector<value_type>& values, const std::vector<bool>& is_valid) { return AppendValues(values.data(), static_cast<int64_t>(values.size()), is_valid); } /// \brief Append a sequence of elements in one shot /// \param[in] values a std::vector of values /// \return Status Status AppendValues(const std::vector<value_type>& values) { return AppendValues(values.data(), static_cast<int64_t>(values.size())); } Status FinishInternal(std::shared_ptr<ArrayData>* out) override { std::shared_ptr<Buffer> data, null_bitmap; ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); ARROW_RETURN_NOT_OK(data_builder_.Finish(&data)); *out = ArrayData::Make(type(), length_, {null_bitmap, data}, null_count_); capacity_ = length_ = null_count_ = 0; return Status::OK(); } /// \cond FALSE using ArrayBuilder::Finish; /// \endcond Status Finish(std::shared_ptr<ArrayType>* out) { return FinishTyped(out); } /// \brief Append a sequence of elements in one shot /// \param[in] values_begin InputIterator to the beginning of the values /// \param[in] values_end InputIterator pointing to the end of the values /// \return Status template <typename ValuesIter> Status AppendValues(ValuesIter values_begin, ValuesIter values_end) { int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end)); ARROW_RETURN_NOT_OK(Reserve(length)); data_builder_.UnsafeAppend(values_begin, values_end); // this updates the length_ UnsafeSetNotNull(length); return Status::OK(); } /// \brief Append a sequence of elements in one shot, with a specified nullmap /// \param[in] values_begin InputIterator to the beginning of the values /// \param[in] values_end InputIterator pointing to the end of the values /// \param[in] valid_begin InputIterator with elements indication valid(1) /// or null(0) values. 
/// \return Status template <typename ValuesIter, typename ValidIter> enable_if_t<!std::is_pointer<ValidIter>::value, Status> AppendValues( ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { static_assert(!internal::is_null_pointer<ValidIter>::value, "Don't pass a NULLPTR directly as valid_begin, use the 2-argument " "version instead"); int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end)); ARROW_RETURN_NOT_OK(Reserve(length)); data_builder_.UnsafeAppend(values_begin, values_end); null_bitmap_builder_.UnsafeAppend<true>( length, [&valid_begin]() -> bool { return *valid_begin++; }); length_ = null_bitmap_builder_.length(); null_count_ = null_bitmap_builder_.false_count(); return Status::OK(); } // Same as above, with a pointer type ValidIter template <typename ValuesIter, typename ValidIter> enable_if_t<std::is_pointer<ValidIter>::value, Status> AppendValues( ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end)); ARROW_RETURN_NOT_OK(Reserve(length)); data_builder_.UnsafeAppend(values_begin, values_end); // this updates the length_ if (valid_begin == NULLPTR) { UnsafeSetNotNull(length); } else { null_bitmap_builder_.UnsafeAppend<true>( length, [&valid_begin]() -> bool { return *valid_begin++; }); length_ = null_bitmap_builder_.length(); null_count_ = null_bitmap_builder_.false_count(); } return Status::OK(); } /// Append a single scalar under the assumption that the underlying Buffer is /// large enough. /// /// This method does not capacity-check; make sure to call Reserve /// beforehand. void UnsafeAppend(const value_type val) { ArrayBuilder::UnsafeAppendToBitmap(true); data_builder_.UnsafeAppend(val); } void UnsafeAppendNull() { ArrayBuilder::UnsafeAppendToBitmap(false); data_builder_.UnsafeAppend(value_type{}); // zero } std::shared_ptr<DataType> type() const override { return type_; } protected: std::shared_ptr<DataType> type_; TypedBufferBuilder<value_type> data_builder_; };
/// Base class for all data array builders. /// /// This class provides a facilities for incrementally building the null bitmap /// (see Append methods) and as a side effect the current number of slots and /// the null count. /// /// \note Users are expected to use builders as one of the concrete types below. /// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use. class ARROW_EXPORT ArrayBuilder { public: explicit ArrayBuilder(MemoryPool* pool) : pool_(pool), null_bitmap_builder_(pool) {} virtual ~ArrayBuilder() = default; /// For nested types. Since the objects are owned by this class instance, we /// skip shared pointers and just return a raw pointer ArrayBuilder* child(int i) { return children_[i].get(); } const std::shared_ptr<ArrayBuilder>& child_builder(int i) const { return children_[i]; } int num_children() const { return static_cast<int>(children_.size()); } virtual int64_t length() const { return length_; } int64_t null_count() const { return null_count_; } int64_t capacity() const { return capacity_; } /// \brief Ensure that enough memory has been allocated to fit the indicated /// number of total elements in the builder, including any that have already /// been appended. Does not account for reallocations that may be due to /// variable size data, like binary values. To make space for incremental /// appends, use Reserve instead. /// /// \param[in] capacity the minimum number of total array values to /// accommodate. Must be greater than the current capacity. /// \return Status virtual Status Resize(int64_t capacity); /// \brief Ensure that there is enough space allocated to append the indicated /// number of elements without any further reallocation. Overallocation is /// used in order to minimize the impact of incremental Reserve() calls. /// Note that additional_capacity is relative to the current number of elements /// rather than to the current capacity, so calls to Reserve() which are not /// interspersed with addition of new elements may not increase the capacity. /// /// \param[in] additional_capacity the number of additional array values /// \return Status Status Reserve(int64_t additional_capacity) { auto current_capacity = capacity(); auto min_capacity = length() + additional_capacity; if (min_capacity <= current_capacity) return Status::OK(); // leave growth factor up to BufferBuilder auto new_capacity = BufferBuilder::GrowByFactor(current_capacity, min_capacity); return Resize(new_capacity); } /// Reset the builder. virtual void Reset(); /// \brief Append a null value to builder virtual Status AppendNull() = 0; /// \brief Append a number of null values to builder virtual Status AppendNulls(int64_t length) = 0; /// \brief Append a non-null value to builder /// /// The appended value is an implementation detail, but the corresponding /// memory slot is guaranteed to be initialized. /// This method is useful when appending a null value to a parent nested type. virtual Status AppendEmptyValue() = 0; /// \brief Append a number of non-null values to builder /// /// The appended values are an implementation detail, but the corresponding /// memory slot is guaranteed to be initialized. /// This method is useful when appending null values to a parent nested type. virtual Status AppendEmptyValues(int64_t length) = 0; /// For cases where raw data was memcpy'd into the internal buffers, allows us /// to advance the length of the builder. It is your responsibility to use /// this function responsibly. 
Status Advance(int64_t elements); /// \brief Return result of builder as an internal generic ArrayData /// object. Resets builder except for dictionary builder /// /// \param[out] out the finalized ArrayData object /// \return Status virtual Status FinishInternal(std::shared_ptr<ArrayData>* out) = 0; /// \brief Return result of builder as an Array object. /// /// The builder is reset except for DictionaryBuilder. /// /// \param[out] out the finalized Array object /// \return Status Status Finish(std::shared_ptr<Array>* out); /// \brief Return result of builder as an Array object. /// /// The builder is reset except for DictionaryBuilder. /// /// \return The finalized Array object Result<std::shared_ptr<Array>> Finish(); /// \brief Return the type of the built Array virtual std::shared_ptr<DataType> type() const = 0; protected: /// Append to null bitmap Status AppendToBitmap(bool is_valid); /// Vector append. Treat each zero byte as a null. If valid_bytes is null /// assume all of length bits are valid. Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length); /// Uniform append. Append N times the same validity bit. Status AppendToBitmap(int64_t num_bits, bool value); /// Set the next length bits to not null (i.e. valid). Status SetNotNull(int64_t length); // Unsafe operations (don't check capacity/don't resize) void UnsafeAppendNull() { UnsafeAppendToBitmap(false); } // Append to null bitmap, update the length void UnsafeAppendToBitmap(bool is_valid) { null_bitmap_builder_.UnsafeAppend(is_valid); ++length_; if (!is_valid) ++null_count_; } // Vector append. Treat each zero byte as a nullzero. If valid_bytes is null // assume all of length bits are valid. void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) { if (valid_bytes == NULLPTR) { return UnsafeSetNotNull(length); } null_bitmap_builder_.UnsafeAppend(valid_bytes, length); length_ += length; null_count_ = null_bitmap_builder_.false_count(); } // Append the same validity value a given number of times. void UnsafeAppendToBitmap(const int64_t num_bits, bool value) { if (value) { UnsafeSetNotNull(num_bits); } else { UnsafeSetNull(num_bits); } } void UnsafeAppendToBitmap(const std::vector<bool>& is_valid); // Set the next validity bits to not null (i.e. valid). void UnsafeSetNotNull(int64_t length); // Set the next validity bits to null (i.e. invalid). 
void UnsafeSetNull(int64_t length); static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer); /// \brief Finish to an array of the specified ArrayType template <typename ArrayType> Status FinishTyped(std::shared_ptr<ArrayType>* out) { std::shared_ptr<Array> out_untyped; ARROW_RETURN_NOT_OK(Finish(&out_untyped)); *out = std::static_pointer_cast<ArrayType>(std::move(out_untyped)); return Status::OK(); } // Check the requested capacity for validity Status CheckCapacity(int64_t new_capacity) { if (ARROW_PREDICT_FALSE(new_capacity < 0)) { return Status::Invalid( "Resize capacity must be positive (requested: ", new_capacity, ")"); } if (ARROW_PREDICT_FALSE(new_capacity < length_)) { return Status::Invalid("Resize cannot downsize (requested: ", new_capacity, ", current length: ", length_, ")"); } return Status::OK(); } // Check for array type Status CheckArrayType(const std::shared_ptr<DataType>& expected_type, const Array& array, const char* message); Status CheckArrayType(Type::type expected_type, const Array& array, const char* message); MemoryPool* pool_; TypedBufferBuilder<bool> null_bitmap_builder_; int64_t null_count_ = 0; // Array length, so far. Also, the index of the next element to be added int64_t length_ = 0; int64_t capacity_ = 0; // Child value array builders. These are owned by this class std::vector<std::shared_ptr<ArrayBuilder>> children_; private: ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder); };
/// Concrete Array class for numeric data.
template <typename TYPE>
class NumericArray : public PrimitiveArray {
 public:
  using TypeClass = TYPE;
  using value_type = typename TypeClass::c_type;
  using IteratorType = stl::ArrayIterator<NumericArray<TYPE>>;

  explicit NumericArray(const std::shared_ptr<ArrayData>& data) : PrimitiveArray(data) {}

  // Only enable this constructor without a type argument for types without additional
  // metadata
  template <typename T1 = TYPE>
  NumericArray(enable_if_parameter_free<T1, int64_t> length,
               const std::shared_ptr<Buffer>& data,
               const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
               int64_t null_count = kUnknownNullCount, int64_t offset = 0)
      : PrimitiveArray(TypeTraits<T1>::type_singleton(), length, data, null_bitmap,
                       null_count, offset) {}

  const value_type* raw_values() const {
    return reinterpret_cast<const value_type*>(raw_values_) + data_->offset;
  }

  value_type Value(int64_t i) const { return raw_values()[i]; }

  // For API compatibility with BinaryArray etc.
  value_type GetView(int64_t i) const { return Value(i); }

  IteratorType begin() const { return IteratorType(*this); }

  IteratorType end() const { return IteratorType(*this, length()); }

 protected:
  using PrimitiveArray::PrimitiveArray;
};
arrow/cpp/src/arrow/array/array_binary.h
// ---------------------------------------------------------------------- // Binary and String /// Base class for variable-sized binary arrays, regardless of offset size /// and logical interpretation. template <typename TYPE> class BaseBinaryArray : public FlatArray { public: using TypeClass = TYPE; using offset_type = typename TypeClass::offset_type; using IteratorType = stl::ArrayIterator<BaseBinaryArray<TYPE>>; /// Return the pointer to the given elements bytes // XXX should GetValue(int64_t i) return a string_view? const uint8_t* GetValue(int64_t i, offset_type* out_length) const { // Account for base offset i += data_->offset; const offset_type pos = raw_value_offsets_[i]; *out_length = raw_value_offsets_[i + 1] - pos; return raw_data_ + pos; } /// \brief Get binary value as a string_view /// /// \param i the value index /// \return the view over the selected value util::string_view GetView(int64_t i) const { // Account for base offset i += data_->offset; const offset_type pos = raw_value_offsets_[i]; return util::string_view(reinterpret_cast<const char*>(raw_data_ + pos), raw_value_offsets_[i + 1] - pos); } /// \brief Get binary value as a std::string /// /// \param i the value index /// \return the value copied into a std::string std::string GetString(int64_t i) const { return std::string(GetView(i)); } /// Note that this buffer does not account for any slice offset std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; } /// Note that this buffer does not account for any slice offset std::shared_ptr<Buffer> value_data() const { return data_->buffers[2]; } const offset_type* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } const uint8_t* raw_data() const { return raw_data_; } /// \brief Return the data buffer absolute offset of the data for the value /// at the passed index. /// /// Does not perform boundschecking offset_type value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } /// \brief Return the length of the data for the value at the passed index. /// /// Does not perform boundschecking offset_type value_length(int64_t i) const { i += data_->offset; return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; } /// \brief Return the total length of the memory in the data buffer /// referenced by this array. If the array has been sliced then this may be /// less than the size of the data buffer (data_->buffers[2]). offset_type total_values_length() const { if (data_->length > 0) { return raw_value_offsets_[data_->length + data_->offset] - raw_value_offsets_[data_->offset]; } else { return 0; } } IteratorType begin() const { return IteratorType(*this); } IteratorType end() const { return IteratorType(*this, length()); } protected: // For subclasses BaseBinaryArray() = default; // Protected method for constructors void SetData(const std::shared_ptr<ArrayData>& data) { this->Array::SetData(data); raw_value_offsets_ = data->GetValuesSafe<offset_type>(1, /*offset=*/0); raw_data_ = data->GetValuesSafe<uint8_t>(2, /*offset=*/0); } const offset_type* raw_value_offsets_ = NULLPTR; const uint8_t* raw_data_ = NULLPTR; };
arrow/cpp/src/arrow/array/array_base.h
/// Base class for non-nested arrays
class ARROW_EXPORT FlatArray : public Array {
protected:
using Array::Array;
};
/// Concrete Array class for list data
class ARROW_EXPORT ListArray : public BaseListArray<ListType> {
 public:
  explicit ListArray(std::shared_ptr<ArrayData> data);

  ListArray(std::shared_ptr<DataType> type, int64_t length,
            std::shared_ptr<Buffer> value_offsets, std::shared_ptr<Array> values,
            std::shared_ptr<Buffer> null_bitmap = NULLPTR,
            int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Construct ListArray from array of offsets and child value array
  ///
  /// This function does the bare minimum of validation of the offsets and
  /// input types, and will allocate a new offsets array if necessary (i.e. if
  /// the offsets contain any nulls). If the offsets do not have nulls, they
  /// are assumed to be well-formed
  ///
  /// \param[in] offsets Array containing n + 1 offsets encoding length and
  /// size. Must be of int32 type
  /// \param[in] values Array containing list values
  /// \param[in] pool MemoryPool in case new offsets array needs to be
  /// allocated because of null values
  static Result<std::shared_ptr<ListArray>> FromArrays(
      const Array& offsets, const Array& values,
      MemoryPool* pool = default_memory_pool());

  /// \brief Return an Array that is a concatenation of the lists in this array.
  ///
  /// Note that it's different from `values()` in that it takes into
  /// consideration of this array's offsets as well as null elements backed
  /// by non-empty lists (they are skipped, thus copying may be needed).
  Result<std::shared_ptr<Array>> Flatten(
      MemoryPool* memory_pool = default_memory_pool()) const;

  /// \brief Return list offsets as an Int32Array
  std::shared_ptr<Array> offsets() const;

 protected:
  // This constructor defers SetData to a derived array class
  ListArray() = default;

  void SetData(const std::shared_ptr<ArrayData>& data);
};
/// Base class for variable-sized list arrays, regardless of offset size.
template <typename TYPE>
class BaseListArray : public Array {
 public:
  using TypeClass = TYPE;
  using offset_type = typename TypeClass::offset_type;

  const TypeClass* list_type() const { return list_type_; }

  /// \brief Return array object containing the list's values
  std::shared_ptr<Array> values() const { return values_; }

  /// Note that this buffer does not account for any slice offset
  std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }

  std::shared_ptr<DataType> value_type() const { return list_type_->value_type(); }

  /// Return pointer to raw value offsets accounting for any slice offset
  const offset_type* raw_value_offsets() const {
    return raw_value_offsets_ + data_->offset;
  }

  // The following functions will not perform boundschecking
  offset_type value_offset(int64_t i) const {
    return raw_value_offsets_[i + data_->offset];
  }
  offset_type value_length(int64_t i) const {
    i += data_->offset;
    return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
  }
  std::shared_ptr<Array> value_slice(int64_t i) const {
    return values_->Slice(value_offset(i), value_length(i));
  }

 protected:
  friend void internal::SetListData<TYPE>(BaseListArray<TYPE>* self,
                                          const std::shared_ptr<ArrayData>& data,
                                          Type::type expected_type_id);

  const TypeClass* list_type_ = NULLPTR;
  std::shared_ptr<Array> values_;
  const offset_type* raw_value_offsets_ = NULLPTR;
};
// Private helper for ListArray::SetData.
// Unfortunately, trying to define BaseListArray::SetData outside of this header
// doesn't play well with MSVC.
template <typename TYPE>
void SetListData(BaseListArray<TYPE>* self, const std::shared_ptr<ArrayData>& data,
Type::type expected_type_id = TYPE::type_id);
} // namespace internal
template <typename TYPE>
inline void SetListData(BaseListArray<TYPE>* self,
                        const std::shared_ptr<ArrayData>& data,
                        Type::type expected_type_id) {
  ARROW_CHECK_EQ(data->buffers.size(), 2);
  ARROW_CHECK_EQ(data->type->id(), expected_type_id);
  ARROW_CHECK_EQ(data->child_data.size(), 1);

  self->Array::SetData(data);

  self->list_type_ = checked_cast<const TYPE*>(data->type.get());
  self->raw_value_offsets_ =
      data->GetValuesSafe<typename TYPE::offset_type>(1, /*offset=*/0);

  ARROW_CHECK_EQ(self->list_type_->value_type()->id(), data->child_data[0]->type->id());
  DCHECK(self->list_type_->value_type()->Equals(data->child_data[0]->type));
  self->values_ = MakeArray(self->data_->child_data[0]);
}
void ListArray::SetData(const std::shared_ptr<ArrayData>& data) {
internal::SetListData(this, data);
}
ListArray::ListArray(std::shared_ptr<ArrayData> data) { SetData(std::move(data)); }
The key to the recursion is here:
std::shared_ptr<ArrayData> items;
The child values are finished into this separate ArrayData ("items") and attached as the child data of the list's ArrayData, so the value builder may itself be another list builder.
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
  ARROW_RETURN_NOT_OK(AppendNextOffset());

  // Offset padding zeroed by BufferBuilder
  std::shared_ptr<Buffer> offsets, null_bitmap;
  ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
  ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));

  if (value_builder_->length() == 0) {
    // Try to make sure we get a non-null values buffer (ARROW-2744)
    ARROW_RETURN_NOT_OK(value_builder_->Resize(0));
  }

  std::shared_ptr<ArrayData> items;
  ARROW_RETURN_NOT_OK(value_builder_->FinishInternal(&items));

  *out = ArrayData::Make(type(), length_, {null_bitmap, offsets}, {std::move(items)},
                         null_count_);
  Reset();
  return Status::OK();
}
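Here is a rough usage sketch of that recursion (error handling via ARROW_RETURN_NOT_OK; the function name and builder setup are illustrative): the value builder of the outer ListBuilder is itself a ListBuilder, so finishing the outer builder recursively finishes the inner one into the child ArrayData.

#include <memory>

#include "arrow/api.h"

// Builds the single nested value [[1, 2], [3]] of type list<item: list<item: int32>>.
arrow::Status BuildNestedList(std::shared_ptr<arrow::Array>* out) {
  auto pool = arrow::default_memory_pool();
  auto int_builder = std::make_shared<arrow::Int32Builder>(pool);
  auto inner_builder = std::make_shared<arrow::ListBuilder>(pool, int_builder);
  arrow::ListBuilder outer_builder(pool, inner_builder);

  ARROW_RETURN_NOT_OK(outer_builder.Append());   // start the outer list slot
  ARROW_RETURN_NOT_OK(inner_builder->Append());  // start inner list [1, 2]
  ARROW_RETURN_NOT_OK(int_builder->Append(1));
  ARROW_RETURN_NOT_OK(int_builder->Append(2));
  ARROW_RETURN_NOT_OK(inner_builder->Append());  // start inner list [3]
  ARROW_RETURN_NOT_OK(int_builder->Append(3));

  // Finish() ends up in FinishInternal(): the outer builder finishes the inner
  // ListBuilder into "items", which in turn finishes the Int32Builder into its
  // own child ArrayData.
  return outer_builder.Finish(out);
}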
arrow/cpp/src/arrow/array/array_base.h
// User array accessor types /// \brief Array base type /// Immutable data array with some logical type and some length. /// /// Any memory is owned by the respective Buffer instance (or its parents). /// /// The base class is only required to have a null bitmap buffer if the null /// count is greater than 0 /// /// If known, the null count can be provided in the base Array constructor. If /// the null count is not known, pass -1 to indicate that the null count is to /// be computed on the first call to null_count() class ARROW_EXPORT Array { public: virtual ~Array() = default; /// \brief Return true if value at index is null. Does not boundscheck bool IsNull(int64_t i) const { return null_bitmap_data_ != NULLPTR && !BitUtil::GetBit(null_bitmap_data_, i + data_->offset); } /// \brief Return true if value at index is valid (not null). Does not /// boundscheck bool IsValid(int64_t i) const { return null_bitmap_data_ == NULLPTR || BitUtil::GetBit(null_bitmap_data_, i + data_->offset); } /// \brief Return a Scalar containing the value of this array at i Result<std::shared_ptr<Scalar>> GetScalar(int64_t i) const; /// Size in the number of elements this array contains. int64_t length() const { return data_->length; } /// A relative position into another array's data, to enable zero-copy /// slicing. This value defaults to zero int64_t offset() const { return data_->offset; } /// The number of null entries in the array. If the null count was not known /// at time of construction (and set to a negative value), then the null /// count will be computed and cached on the first invocation of this /// function int64_t null_count() const; std::shared_ptr<DataType> type() const { return data_->type; } Type::type type_id() const { return data_->type->id(); } /// Buffer for the validity (null) bitmap, if any. Note that Union types /// never have a null bitmap. /// /// Note that for `null_count == 0` or for null type, this will be null. /// This buffer does not account for any slice offset const std::shared_ptr<Buffer>& null_bitmap() const { return data_->buffers[0]; } /// Raw pointer to the null bitmap. /// /// Note that for `null_count == 0` or for null type, this will be null. /// This buffer does not account for any slice offset const uint8_t* null_bitmap_data() const { return null_bitmap_data_; } /// Equality comparison with another array bool Equals(const Array& arr, const EqualOptions& = EqualOptions::Defaults()) const; bool Equals(const std::shared_ptr<Array>& arr, const EqualOptions& = EqualOptions::Defaults()) const; /// \brief Return the formatted unified diff of arrow::Diff between this /// Array and another Array std::string Diff(const Array& other) const; /// Approximate equality comparison with another array /// /// epsilon is only used if this is FloatArray or DoubleArray bool ApproxEquals(const std::shared_ptr<Array>& arr, const EqualOptions& = EqualOptions::Defaults()) const; bool ApproxEquals(const Array& arr, const EqualOptions& = EqualOptions::Defaults()) const; /// Compare if the range of slots specified are equal for the given array and /// this array. end_idx exclusive. This methods does not bounds check. 
bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx, const Array& other, const EqualOptions& = EqualOptions::Defaults()) const; bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx, const std::shared_ptr<Array>& other, const EqualOptions& = EqualOptions::Defaults()) const; bool RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx, int64_t other_start_idx, const EqualOptions& = EqualOptions::Defaults()) const; bool RangeEquals(const std::shared_ptr<Array>& other, int64_t start_idx, int64_t end_idx, int64_t other_start_idx, const EqualOptions& = EqualOptions::Defaults()) const; Status Accept(ArrayVisitor* visitor) const; /// Construct a zero-copy view of this array with the given type. /// /// This method checks if the types are layout-compatible. /// Nested types are traversed in depth-first order. Data buffers must have /// the same item sizes, even though the logical types may be different. /// An error is returned if the types are not layout-compatible. Result<std::shared_ptr<Array>> View(const std::shared_ptr<DataType>& type) const; /// Construct a zero-copy slice of the array with the indicated offset and /// length /// /// \param[in] offset the position of the first element in the constructed /// slice /// \param[in] length the length of the slice. If there are not enough /// elements in the array, the length will be adjusted accordingly /// /// \return a new object wrapped in std::shared_ptr<Array> std::shared_ptr<Array> Slice(int64_t offset, int64_t length) const; /// Slice from offset until end of the array std::shared_ptr<Array> Slice(int64_t offset) const; /// Input-checking variant of Array::Slice Result<std::shared_ptr<Array>> SliceSafe(int64_t offset, int64_t length) const; /// Input-checking variant of Array::Slice Result<std::shared_ptr<Array>> SliceSafe(int64_t offset) const; const std::shared_ptr<ArrayData>& data() const { return data_; } int num_fields() const { return static_cast<int>(data_->child_data.size()); } /// \return PrettyPrint representation of array suitable for debugging std::string ToString() const; /// \brief Perform cheap validation checks to determine obvious inconsistencies /// within the array's internal data. /// /// This is O(k) where k is the number of descendents. /// /// \return Status Status Validate() const; /// \brief Perform extensive validation checks to determine inconsistencies /// within the array's internal data. /// /// This is potentially O(k*n) where k is the number of descendents and n /// is the array length. /// /// \return Status Status ValidateFull() const; protected: Array() : null_bitmap_data_(NULLPTR) {} std::shared_ptr<ArrayData> data_; const uint8_t* null_bitmap_data_; /// Protected method for constructors void SetData(const std::shared_ptr<ArrayData>& data) { if (data->buffers.size() > 0) { null_bitmap_data_ = data->GetValuesSafe<uint8_t>(0, /*offset=*/0); } else { null_bitmap_data_ = NULLPTR; } data_ = data; } private: ARROW_DISALLOW_COPY_AND_ASSIGN(Array); };
arrow/cpp/src/arrow/buffer_builder.h
// ---------------------------------------------------------------------- // Buffer builder classes /// \class BufferBuilder /// \brief A class for incrementally building a contiguous chunk of in-memory /// data class ARROW_EXPORT BufferBuilder { public: explicit BufferBuilder(MemoryPool* pool = default_memory_pool()) : pool_(pool), data_(/*ensure never null to make ubsan happy and avoid check penalties below*/ &util::internal::non_null_filler), capacity_(0), size_(0) {} /// \brief Constructs new Builder that will start using /// the provided buffer until Finish/Reset are called. /// The buffer is not resized. explicit BufferBuilder(std::shared_ptr<ResizableBuffer> buffer, MemoryPool* pool = default_memory_pool()) : buffer_(std::move(buffer)), pool_(pool), data_(buffer_->mutable_data()), capacity_(buffer_->capacity()), size_(buffer_->size()) {} /// \brief Resize the buffer to the nearest multiple of 64 bytes /// /// \param new_capacity the new capacity of the of the builder. Will be /// rounded up to a multiple of 64 bytes for padding \param shrink_to_fit if /// new capacity is smaller than the existing size, reallocate internal /// buffer. Set to false to avoid reallocations when shrinking the builder. /// \return Status Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) { // Resize(0) is a no-op if (new_capacity == 0) { return Status::OK(); } if (buffer_ == NULLPTR) { ARROW_ASSIGN_OR_RAISE(buffer_, AllocateResizableBuffer(new_capacity, pool_)); } else { ARROW_RETURN_NOT_OK(buffer_->Resize(new_capacity, shrink_to_fit)); } capacity_ = buffer_->capacity(); data_ = buffer_->mutable_data(); return Status::OK(); } /// \brief Ensure that builder can accommodate the additional number of bytes /// without the need to perform allocations /// /// \param[in] additional_bytes number of additional bytes to make space for /// \return Status Status Reserve(const int64_t additional_bytes) { auto min_capacity = size_ + additional_bytes; if (min_capacity <= capacity_) { return Status::OK(); } return Resize(GrowByFactor(capacity_, min_capacity), false); } /// \brief Return a capacity expanded by the desired growth factor static int64_t GrowByFactor(int64_t current_capacity, int64_t new_capacity) { // Doubling capacity except for large Reserve requests. 2x growth strategy // (versus 1.5x) seems to have slightly better performance when using // jemalloc, but significantly better performance when using the system // allocator. See ARROW-6450 for further discussion return std::max(new_capacity, current_capacity * 2); } /// \brief Append the given data to the buffer /// /// The buffer is automatically expanded if necessary. Status Append(const void* data, const int64_t length) { if (ARROW_PREDICT_FALSE(size_ + length > capacity_)) { ARROW_RETURN_NOT_OK(Resize(GrowByFactor(capacity_, size_ + length), false)); } UnsafeAppend(data, length); return Status::OK(); } /// \brief Append copies of a value to the buffer /// /// The buffer is automatically expanded if necessary. 
Status Append(const int64_t num_copies, uint8_t value) { ARROW_RETURN_NOT_OK(Reserve(num_copies)); UnsafeAppend(num_copies, value); return Status::OK(); } // Advance pointer and zero out memory Status Advance(const int64_t length) { return Append(length, 0); } // Advance pointer, but don't allocate or zero memory void UnsafeAdvance(const int64_t length) { size_ += length; } // Unsafe methods don't check existing size void UnsafeAppend(const void* data, const int64_t length) { memcpy(data_ + size_, data, static_cast<size_t>(length)); size_ += length; } void UnsafeAppend(const int64_t num_copies, uint8_t value) { memset(data_ + size_, value, static_cast<size_t>(num_copies)); size_ += num_copies; } /// \brief Return result of builder as a Buffer object. /// /// The builder is reset and can be reused afterwards. /// /// \param[out] out the finalized Buffer object /// \param shrink_to_fit if the buffer size is smaller than its capacity, /// reallocate to fit more tightly in memory. Set to false to avoid /// a reallocation, at the expense of potentially more memory consumption. /// \return Status Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) { ARROW_RETURN_NOT_OK(Resize(size_, shrink_to_fit)); if (size_ != 0) buffer_->ZeroPadding(); *out = buffer_; if (*out == NULLPTR) { ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(0, pool_)); } Reset(); return Status::OK(); } Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) { std::shared_ptr<Buffer> out; ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit)); return out; } void Reset() { buffer_ = NULLPTR; capacity_ = size_ = 0; } /// \brief Set size to a smaller value without modifying builder /// contents. For reusable BufferBuilder classes /// \param[in] position must be non-negative and less than or equal /// to the current length() void Rewind(int64_t position) { size_ = position; } int64_t capacity() const { return capacity_; } int64_t length() const { return size_; } const uint8_t* data() const { return data_; } uint8_t* mutable_data() { return data_; } private: std::shared_ptr<ResizableBuffer> buffer_; MemoryPool* pool_; uint8_t* data_; int64_t capacity_; int64_t size_; };