// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
1718use crate::arrow::record_reader::buffer::ValuesBuffer;
19use arrow_array::{builder::make_view, make_array, ArrayRef};
20use arrow_buffer::Buffer;
21use arrow_data::ArrayDataBuilder;
22use arrow_schema::DataType as ArrowType;
2324/// A buffer of view type byte arrays that can be converted into
25/// `GenericByteViewArray`
26///
27/// Note this does not reuse `GenericByteViewBuilder` due to the need to call `pad_nulls`
28/// and reuse the existing logic for Vec in the parquet crate
29#[derive(Debug, Default)]
30pub struct ViewBuffer {
31pub views: Vec<u128>,
32pub buffers: Vec<Buffer>,
33}
3435impl ViewBuffer {
36pub fn is_empty(&self) -> bool {
37self.views.is_empty()
38 }
3940pub fn append_block(&mut self, block: Buffer) -> u32 {
41let block_id = self.buffers.len() as u32;
42self.buffers.push(block);
43 block_id
44 }
4546/// # Safety
47 /// This method is only safe when:
48 /// - `block` is a valid index, i.e., the return value of `append_block`
49 /// - `offset` and `offset + len` are valid indices into the buffer
50 /// - The `(offset, offset + len)` is valid value for the native type.
51pub unsafe fn append_view_unchecked(&mut self, block: u32, offset: u32, len: u32) {
52let b = self.buffers.get_unchecked(block as usize);
53let end = offset.saturating_add(len);
54let b = b.get_unchecked(offset as usize..end as usize);
5556let view = make_view(b, block, offset);
5758self.views.push(view);
59 }
6061/// Directly append a view to the view array.
62 /// This is used when we create a StringViewArray from a dictionary whose values are StringViewArray.
63 ///
64 /// # Safety
65 /// The `view` must be a valid view as per the ByteView spec.
66pub unsafe fn append_raw_view_unchecked(&mut self, view: &u128) {
67self.views.push(*view);
68 }
6970/// Converts this into an [`ArrayRef`] with the provided `data_type` and `null_buffer`
71pub fn into_array(self, null_buffer: Option<Buffer>, data_type: &ArrowType) -> ArrayRef {
72let len = self.views.len();
73let views = Buffer::from_vec(self.views);
74match data_type {
75 ArrowType::Utf8View => {
76let builder = ArrayDataBuilder::new(ArrowType::Utf8View)
77 .len(len)
78 .add_buffer(views)
79 .add_buffers(self.buffers)
80 .null_bit_buffer(null_buffer);
81// We have checked that the data is utf8 when building the buffer, so it is safe
82let array = unsafe { builder.build_unchecked() };
83 make_array(array)
84 }
85 ArrowType::BinaryView => {
86let builder = ArrayDataBuilder::new(ArrowType::BinaryView)
87 .len(len)
88 .add_buffer(views)
89 .add_buffers(self.buffers)
90 .null_bit_buffer(null_buffer);
91let array = unsafe { builder.build_unchecked() };
92 make_array(array)
93 }
94_ => panic!("Unsupported data type: {:?}", data_type),
95 }
96 }
97}
9899impl ValuesBuffer for ViewBuffer {
100fn pad_nulls(
101&mut self,
102 read_offset: usize,
103 values_read: usize,
104 levels_read: usize,
105 valid_mask: &[u8],
106 ) {
107self.views
108 .pad_nulls(read_offset, values_read, levels_read, valid_mask);
109 }
110}
#[cfg(test)]
mod tests {
    use super::*;
    use arrow_array::Array;

    /// Bytes backing the non-empty tests: ten digits followed by a 31-byte phrase.
    const DATA: &[u8] = b"0123456789long string to test string view";

    /// Builds a buffer holding one block of `DATA` and three views over it:
    /// `"0"`, `"123456789"` and the trailing phrase.
    fn three_view_buffer() -> ViewBuffer {
        let mut buffer = ViewBuffer::default();
        let block_id = buffer.append_block(Buffer::from(DATA));
        for (offset, len) in [(0, 1), (1, 9), (10, 31)] {
            // SAFETY: `block_id` comes from `append_block`, and every
            // `(offset, len)` pair lies within the 41 ASCII bytes of `DATA`.
            unsafe { buffer.append_view_unchecked(block_id, offset, len) };
        }
        buffer
    }

    /// Downcasts `array` to a string view array, panicking on any other type.
    fn as_string_view(array: &ArrayRef) -> &arrow::array::StringViewArray {
        array
            .as_any()
            .downcast_ref::<arrow::array::StringViewArray>()
            .unwrap()
    }

    #[test]
    fn test_view_buffer_empty() {
        let array = ViewBuffer::default().into_array(None, &ArrowType::Utf8View);
        assert_eq!(as_string_view(&array).len(), 0);
    }

    #[test]
    fn test_view_buffer_append_view() {
        let array = three_view_buffer().into_array(None, &ArrowType::Utf8View);
        let actual: Vec<_> = as_string_view(&array).iter().collect();

        let expected = vec![
            Some("0"),
            Some("123456789"),
            Some("long string to test string view"),
        ];
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_view_buffer_pad_null() {
        let mut buffer = three_view_buffer();

        let valid = [true, false, false, true, false, false, true];
        let valid_mask = Buffer::from_iter(valid.iter().copied());

        // Spread the two values read after offset 1 across the validity mask.
        buffer.pad_nulls(1, 2, valid.len() - 1, valid_mask.as_slice());

        let array = buffer.into_array(Some(valid_mask), &ArrowType::Utf8View);
        let actual: Vec<_> = as_string_view(&array).iter().collect();

        let expected = vec![
            Some("0"),
            None,
            None,
            Some("123456789"),
            None,
            None,
            Some("long string to test string view"),
        ];
        assert_eq!(actual, expected);
    }
}