use std::collections::BTreeMap;
use timely::container::PushInto;
use crate::trace::implementations::{BatchContainer, OffsetList};
use self::wrapper::Wrapped;
use self::encoded::Encoded;
use self::huffman::Huffman;
pub struct HuffmanContainer<B: Ord+Clone> {
inner: Result<(Huffman<B>, Vec<u8>), Vec<B>>,
offsets: OffsetList,
stats: BTreeMap<B, i64>
}
impl<B> HuffmanContainer<B>
where
B: Ord + Clone,
{
pub fn print(&self) {
if let Ok((_huff, bytes)) = &self.inner {
println!("Bytes: {:?}, Symbols: {:?}", bytes.len(), self.stats.values().sum::<i64>());
}
}
}
impl<B: Ord + Clone + 'static> PushInto<Vec<B>> for HuffmanContainer<B> {
fn push_into(&mut self, item: Vec<B>) {
for x in item.iter() { *self.stats.entry(x.clone()).or_insert(0) += 1; }
match &mut self.inner {
Ok((huffman, bytes)) => {
bytes.extend(huffman.encode(item.iter()));
self.offsets.push(bytes.len());
},
Err(raw) => {
raw.extend(item);
self.offsets.push(raw.len());
}
}
}
}
impl<'a, B: Ord + Clone + 'static> PushInto<Wrapped<'a, B>> for HuffmanContainer<B> {
fn push_into(&mut self, item: Wrapped<'a, B>) {
match item.decode() {
Ok(decoded) => {
for x in decoded { *self.stats.entry(x.clone()).or_insert(0) += 1; }
},
Err(symbols) => {
for x in symbols.iter() { *self.stats.entry(x.clone()).or_insert(0) += 1; }
}
}
match (item.decode(), &mut self.inner) {
(Ok(decoded), Ok((huffman, bytes))) => {
bytes.extend(huffman.encode(decoded));
self.offsets.push(bytes.len());
}
(Ok(decoded), Err(raw)) => {
raw.extend(decoded.cloned());
self.offsets.push(raw.len());
}
(Err(symbols), Ok((huffman, bytes))) => {
bytes.extend(huffman.encode(symbols.iter()));
self.offsets.push(bytes.len());
}
(Err(symbols), Err(raw)) => {
raw.extend(symbols.iter().cloned());
self.offsets.push(raw.len());
}
}
}
}
impl<B: Ord + Clone + 'static> BatchContainer for HuffmanContainer<B> {
type Owned = Vec<B>;
type ReadItem<'a> = Wrapped<'a, B>;
fn reborrow<'b, 'a: 'b>(item: Self::ReadItem<'a>) -> Self::ReadItem<'b> { item }
fn with_capacity(size: usize) -> Self {
let mut offsets = OffsetList::with_capacity(size + 1);
offsets.push(0);
Self {
inner: Err(Vec::with_capacity(size)),
offsets,
stats: Default::default(),
}
}
fn merge_capacity(cont1: &Self, cont2: &Self) -> Self {
if cont1.len() > 0 { cont1.print(); }
if cont2.len() > 0 { cont2.print(); }
let mut counts = BTreeMap::default();
for (symbol, count) in cont1.stats.iter() {
*counts.entry(symbol.clone()).or_insert(0) += count;
}
for (symbol, count) in cont2.stats.iter() {
*counts.entry(symbol.clone()).or_insert(0) += count;
}
let bytes = Vec::with_capacity(counts.values().cloned().sum::<i64>() as usize);
let huffman = Huffman::create_from(counts);
let inner = Ok((huffman, bytes));
let length = cont1.offsets.len() + cont2.offsets.len() - 2;
let mut offsets = OffsetList::with_capacity(length + 1);
offsets.push(0);
Self {
inner,
offsets,
stats: Default::default(),
}
}
fn index(&self, index: usize) -> Self::ReadItem<'_> {
let lower = self.offsets.index(index);
let upper = self.offsets.index(index+1);
match &self.inner {
Ok((huffman, bytes)) => Wrapped::encoded(Encoded::new(huffman, &bytes[lower .. upper])),
Err(raw) => Wrapped::decoded(&raw[lower .. upper]),
}
}
fn len(&self) -> usize {
self.offsets.len() - 1
}
}
impl<B: Ord+Clone> Default for HuffmanContainer<B> {
fn default() -> Self {
let mut offsets = OffsetList::with_capacity(1);
offsets.push(0);
Self {
inner: Err(Vec::new()),
offsets,
stats: Default::default(),
}
}
}
mod wrapper {
use crate::trace::IntoOwned;
use super::Encoded;
pub struct Wrapped<'a, B: Ord> {
inner: Result<Encoded<'a, B>, &'a [B]>,
}
impl<'a, B: Ord> Wrapped<'a, B> {
pub fn decode(&'a self) -> Result<impl Iterator<Item=&'a B> + 'a, &'a [B]> {
match &self.inner {
Ok(encoded) => Ok(encoded.decode()),
Err(symbols) => Err(symbols),
}
}
pub fn encoded(e: Encoded<'a, B>) -> Self { Self { inner: Ok(e) } }
pub fn decoded(d: &'a [B]) -> Self { Self { inner: Err(d) } }
}
impl<'a, B: Ord> Copy for Wrapped<'a, B> { }
impl<'a, B: Ord> Clone for Wrapped<'a, B> {
fn clone(&self) -> Self { *self }
}
use std::cmp::Ordering;
impl<'a, 'b, B: Ord> PartialEq<Wrapped<'a, B>> for Wrapped<'b, B> {
fn eq(&self, other: &Wrapped<'a, B>) -> bool {
match (self.decode(), other.decode()) {
(Ok(decode1), Ok(decode2)) => decode1.eq(decode2),
(Ok(decode1), Err(bytes2)) => decode1.eq(bytes2.iter()),
(Err(bytes1), Ok(decode2)) => bytes1.iter().eq(decode2),
(Err(bytes1), Err(bytes2)) => bytes1.eq(bytes2),
}
}
}
impl<'a, B: Ord> Eq for Wrapped<'a, B> { }
impl<'a, 'b, B: Ord> PartialOrd<Wrapped<'a, B>> for Wrapped<'b, B> {
fn partial_cmp(&self, other: &Wrapped<'a, B>) -> Option<Ordering> {
match (self.decode(), other.decode()) {
(Ok(decode1), Ok(decode2)) => decode1.partial_cmp(decode2),
(Ok(decode1), Err(bytes2)) => decode1.partial_cmp(bytes2.iter()),
(Err(bytes1), Ok(decode2)) => bytes1.iter().partial_cmp(decode2),
(Err(bytes1), Err(bytes2)) => bytes1.partial_cmp(bytes2),
}
}
}
impl<'a, B: Ord> Ord for Wrapped<'a, B> {
fn cmp(&self, other: &Self) -> Ordering {
self.partial_cmp(other).unwrap()
}
}
impl<'a, B: Ord+Clone> IntoOwned<'a> for Wrapped<'a, B> {
type Owned = Vec<B>;
fn into_owned(self) -> Self::Owned {
match self.decode() {
Ok(decode) => decode.cloned().collect(),
Err(bytes) => bytes.to_vec(),
}
}
fn clone_onto(self, other: &mut Self::Owned) {
other.clear();
match self.decode() {
Ok(decode) => other.extend(decode.cloned()),
Err(bytes) => other.extend_from_slice(bytes),
}
}
fn borrow_as(owned: &'a Self::Owned) -> Self {
Self { inner: Err(&owned[..]) }
}
}
}
mod encoded {
use super::Huffman;
pub struct Encoded<'a, B: Ord> {
huffman: &'a Huffman<B>,
bytes: &'a [u8],
}
impl<'a, B: Ord> Encoded<'a, B> {
pub fn decode(&'a self) -> impl Iterator<Item=&'a B> + 'a {
self.huffman.decode(self.bytes.iter().cloned())
}
pub fn new(huffman: &'a Huffman<B>, bytes: &'a [u8]) -> Self {
Self { huffman, bytes }
}
}
impl<'a, B: Ord> Copy for Encoded<'a, B> { }
impl<'a, B: Ord> Clone for Encoded<'a, B> {
fn clone(&self) -> Self { *self }
}
}
mod huffman {
use std::collections::BTreeMap;
use std::convert::TryInto;
use self::decoder::Decoder;
use self::encoder::Encoder;
pub struct Huffman<T: Ord> {
encode: BTreeMap<T, (usize, u64)>,
decode: [Decode<T>; 256],
}
impl<T: Ord> Huffman<T> {
pub fn encode<'a, I>(&'a self, symbols: I) -> Encoder<'a, T, I::IntoIter>
where
I: IntoIterator<Item = &'a T>,
{
Encoder::new(&self.encode, symbols.into_iter())
}
pub fn decode<I>(&self, bytes: I) -> Decoder<'_, T, I::IntoIter>
where
I: IntoIterator<Item=u8>
{
Decoder::new(&self.decode, bytes.into_iter())
}
pub fn create_from(counts: BTreeMap<T, i64>) -> Self where T: Clone {
if counts.is_empty() {
return Self {
encode: Default::default(),
decode: Decode::map(),
};
}
let mut heap = std::collections::BinaryHeap::new();
for (item, count) in counts {
heap.push((-count, Node::Leaf(item)));
}
let mut tree = Vec::with_capacity(2 * heap.len() - 1);
while heap.len() > 1 {
let (count1, least1) = heap.pop().unwrap();
let (count2, least2) = heap.pop().unwrap();
let fork = Node::Fork(tree.len(), tree.len()+1);
tree.push(least1);
tree.push(least2);
heap.push((count1 + count2, fork));
}
tree.push(heap.pop().unwrap().1);
let mut levels = Vec::with_capacity(1 + tree.len()/2);
let mut todo = vec![(tree.last().unwrap(), 0)];
while let Some((node, level)) = todo.pop() {
match node {
Node::Leaf(sym) => { levels.push((level, sym)); },
Node::Fork(l,r) => {
todo.push((&tree[*l], level + 1));
todo.push((&tree[*r], level + 1));
},
}
}
levels.sort_by(|x,y| x.0.cmp(&y.0));
let mut code: u64 = 0;
let mut prev_level = 0;
let mut encode = BTreeMap::new();
let mut decode = Decode::map();
for (level, sym) in levels {
if prev_level != level {
code <<= level - prev_level;
prev_level = level;
}
encode.insert(sym.clone(), (level, code));
Self::insert_decode(&mut decode, sym, level, code << (64-level));
code += 1;
}
for (index, entry) in decode.iter().enumerate() {
if entry.any_void() {
panic!("VOID FOUND: {:?}", index);
}
}
Huffman {
encode,
decode,
}
}
fn insert_decode(map: &mut [Decode<T>; 256], symbol: &T, bits: usize, code: u64) where T: Clone {
let byte: u8 = (code >> 56).try_into().unwrap();
if bits <= 8 {
for off in 0 .. (1 << (8 - bits)) {
map[(byte as usize) + off] = Decode::Symbol(symbol.clone(), bits);
}
}
else {
if let Decode::Void = &map[byte as usize] {
map[byte as usize] = Decode::Further(Box::new(Decode::map()));
}
if let Decode::Further(next_map) = &mut map[byte as usize] {
Self::insert_decode(next_map, symbol, bits - 8, code << 8);
}
}
}
}
#[derive(Eq, PartialEq, Ord, PartialOrd, Debug)]
enum Node<T> {
Leaf(T),
Fork(usize, usize),
}
#[derive(Eq, PartialEq, Ord, PartialOrd, Debug, Default)]
pub enum Decode<T> {
#[default]
Void,
Symbol(T, usize),
Further(Box<[Decode<T>; 256]>),
}
impl<T> Decode<T> {
fn any_void(&self) -> bool {
match self {
Decode::Void => true,
Decode::Symbol(_,_) => false,
Decode::Further(map) => map.iter().any(|m| m.any_void()),
}
}
fn map() -> [Decode<T>; 256] {
let mut vec = Vec::with_capacity(256);
for _ in 0 .. 256 {
vec.push(Decode::Void);
}
vec.try_into().ok().unwrap()
}
}
mod decoder {
use super::Decode;
#[derive(Copy, Clone)]
pub struct Decoder<'a, T, I> {
decode: &'a [Decode<T>; 256],
bytes: I,
pending_byte: u16,
pending_bits: usize,
}
impl<'a, T, I> Decoder<'a, T, I> {
pub fn new(decode: &'a [Decode<T>; 256], bytes: I) -> Self {
Self {
decode,
bytes,
pending_byte: 0,
pending_bits: 0,
}
}
}
impl<'a, T, I> Iterator for Decoder<'a, T, I>
where
I: Iterator<Item=u8>,
{
type Item = &'a T;
fn next(&mut self) -> Option<&'a T> {
let mut map = self.decode;
loop {
if self.pending_bits < 8 {
if let Some(next_byte) = self.bytes.next() {
self.pending_byte = (self.pending_byte << 8) + next_byte as u16;
self.pending_bits += 8;
}
else {
return None;
}
}
let byte = (self.pending_byte >> (self.pending_bits - 8)) as usize;
match &map[byte] {
Decode::Void => { panic!("invalid decoding map"); }
Decode::Symbol(s, bits) => {
self.pending_bits -= bits;
self.pending_byte &= (1 << self.pending_bits) - 1;
return Some(s);
}
Decode::Further(next_map) => {
self.pending_bits -= 8;
self.pending_byte &= (1 << self.pending_bits) - 1;
map = next_map;
}
}
}
}
}
}
mod encoder {
use std::collections::BTreeMap;
#[derive(Copy, Clone)]
pub struct Encoder<'a, T, I> {
encode: &'a BTreeMap<T, (usize, u64)>,
symbols: I,
pending_byte: u64,
pending_bits: usize,
}
impl<'a, T, I> Encoder<'a, T, I> {
pub fn new(encode: &'a BTreeMap<T, (usize, u64)>, symbols: I) -> Self {
Self {
encode,
symbols,
pending_byte: 0,
pending_bits: 0,
}
}
}
impl<'a, T: Ord, I> Iterator for Encoder<'a, T, I>
where
I: Iterator<Item=&'a T>,
{
type Item = u8;
fn next(&mut self) -> Option<u8> {
while self.pending_bits < 8 {
if let Some(symbol) = self.symbols.next() {
let (bits, code) = self.encode.get(symbol).unwrap();
self.pending_byte <<= bits;
self.pending_byte += code;
self.pending_bits += bits;
}
else {
if self.pending_bits > 0 {
let byte = self.pending_byte << (8 - self.pending_bits);
self.pending_bits = 0;
self.pending_byte = 0;
return Some(byte as u8);
}
else {
return None;
}
}
}
let byte = self.pending_byte >> (self.pending_bits - 8);
self.pending_bits -= 8;
self.pending_byte &= (1 << self.pending_bits) - 1;
Some(byte as u8)
}
}
}
}