convert_case/
segmentation.rs

#[cfg(test)]
use strum_macros::EnumIter;

use unicode_segmentation::{UnicodeSegmentation}; //, GraphemeCursor};

/// A boundary defines how a string is split into words.  Some boundaries, `Hyphen`, `Underscore`,
/// and `Space`, consume the character they split on, whereas the other boundaries
/// do not.
///
/// The struct offers methods that return `Vec`s containing useful groups of boundaries.  It also
/// contains the [`list_from`](Boundary::list_from) method which will generate a list of boundaries
/// based on a string slice.
///
/// Note that all boundaries are distinct and do not share functionality.  That is, there is no
/// such DigitLetter variant, because that would be equivalent to the current `DigitUpper` and
/// `DigitLower` variants.  For common functionality, consider using
/// some provided functions that return a list of boundaries.
/// ```
/// use convert_case::{Boundary, Case, Casing, Converter};
///
/// assert_eq!(
///     "transformations_in_3d",
///     "TransformationsIn3D"
///         .from_case(Case::Camel)
///         .without_boundaries(&Boundary::digit_letter())
///         .to_case(Case::Snake)
/// );
///
/// let conv = Converter::new()
///     .set_boundaries(&Boundary::list_from("aA "))
///     .to_case(Case::Title);
/// assert_eq!("7empest By Tool", conv.convert("7empest byTool"));
/// ```
#[cfg_attr(test, derive(EnumIter))]
#[derive(Clone, Copy, Eq, PartialEq, Debug)]
pub enum Boundary {
    /// Splits on `-`, consuming the character on segmentation.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::Hyphen],
    ///     Boundary::list_from("-")
    /// );
    /// ```
    Hyphen,

    /// Splits on `_`, consuming the character on segmentation.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::Underscore],
    ///     Boundary::list_from("_")
    /// );
    /// ```
    Underscore,

    /// Splits on space, consuming the character on segmentation.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::Space],
    ///     Boundary::list_from(" ")
    /// );
    /// ```
    Space,

    /// Splits where an uppercase letter is followed by a lowercase letter.  This is seldom used,
    /// and is not included in the [defaults](Boundary::defaults).
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::UpperLower],
    ///     Boundary::list_from("Aa")
    /// );
    /// ```
    UpperLower,

    /// Splits where a lowercase letter is followed by an uppercase letter.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::LowerUpper],
    ///     Boundary::list_from("aA")
    /// );
    /// ```
    LowerUpper,

    /// Splits where digit is followed by an uppercase letter.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::DigitUpper],
    ///     Boundary::list_from("1A")
    /// );
    /// ```
    DigitUpper,

    /// Splits where an uppercase letter is followed by a digit.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::UpperDigit],
    ///     Boundary::list_from("A1")
    /// );
    /// ```
    UpperDigit,

    /// Splits where digit is followed by a lowercase letter.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::DigitLower],
    ///     Boundary::list_from("1a")
    /// );
    /// ```
    DigitLower,

    /// Splits where a lowercase letter is followed by a digit.
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::LowerDigit],
    ///     Boundary::list_from("a1")
    /// );
    /// ```
    LowerDigit,

    /// Acronyms are identified by two uppercase letters followed by a lowercase letter.
    /// The word boundary is between the two uppercase letters.  For example, "HTTPRequest"
    /// would have an acronym boundary identified at "PRe" and split into "HTTP" and "Request".
    /// ```
    /// use convert_case::Boundary;
    /// assert_eq!(
    ///     vec![Boundary::Acronym],
    ///     Boundary::list_from("AAa")
    /// );
    /// ```
    Acronym,
}

impl Boundary {
    /// Returns a list of all boundaries that are identified within the given string.
    /// Could be a short of writing out all the boundaries in a list directly.  This will not
    /// identify boundary `UpperLower` if it also used as part of `Acronym`.
    ///
    /// If you want to be very explicit and not overlap boundaries, it is recommended to use a colon
    /// character.
    /// ```
    /// use convert_case::Boundary;
    /// use Boundary::*;
    /// assert_eq!(
    ///     vec![Hyphen, Space, LowerUpper, UpperDigit, DigitLower],
    ///     Boundary::list_from("aA8a -")
    /// );
    /// assert_eq!(
    ///     vec![Underscore, LowerUpper, DigitUpper, Acronym],
    ///     Boundary::list_from("bD:0B:_:AAa")
    /// );
    /// ```
    pub fn list_from(s: &str) -> Vec<Self> {
        Boundary::all().iter().filter(|boundary| {
            let left_iter = s.graphemes(true);
            let mid_iter = s.graphemes(true).skip(1);
            let right_iter = s.graphemes(true).skip(2);

            let mut one_iter = left_iter.clone();

            // Also capture when the previous pair was both uppercase, so we don't
            // match the UpperLower boundary in the case of Acronym
            let two_iter = left_iter.clone().zip(mid_iter.clone());
            let mut two_iter_and_upper = two_iter.clone()
                .zip(std::iter::once(false).chain(
                        two_iter.map(|(a, b)| grapheme_is_uppercase(a) && grapheme_is_uppercase(b))
                ));

            let mut three_iter = left_iter.zip(mid_iter).zip(right_iter);

            one_iter.any(|a| boundary.detect_one(a))
                || two_iter_and_upper.any(|((a, b), is_acro)| boundary.detect_two(a, b) && !is_acro)
                || three_iter.any(|((a, b), c)| boundary.detect_three(a, b, c))
        }).copied().collect()
    }

    /// The default list of boundaries used when `Casing::to_case` is called directly
    /// and in a `Converter` generated from `Converter::new()`.  This includes
    /// all the boundaries except the `UpperLower` boundary.
    /// ```
    /// use convert_case::Boundary;
    /// use Boundary::*;
    /// assert_eq!(
    ///     vec![
    ///         Underscore, Hyphen, Space, LowerUpper, UpperDigit, 
    ///         DigitUpper, DigitLower, LowerDigit, Acronym,
    ///     ],
    ///     Boundary::defaults()
    /// );
    /// ```
    pub fn defaults() -> Vec<Self> {
        use Boundary::*;
        vec![
            Underscore, Hyphen, Space, LowerUpper, UpperDigit, DigitUpper, DigitLower, LowerDigit,
            Acronym,
        ]
    }

    /// Returns the boundaries that split around single characters: `Hyphen`,
    /// `Underscore`, and `Space`.
    /// ```
    /// use convert_case::Boundary;
    /// use Boundary::*;
    /// assert_eq!(
    ///     vec![Hyphen, Underscore, Space],
    ///     Boundary::delims()
    /// );
    /// ```
    pub fn delims() -> Vec<Self> {
        use Boundary::*;
        vec![Hyphen, Underscore, Space]
    }

    /// Returns the boundaries that involve digits: `DigitUpper`, `DigitLower`, `UpperDigit`, and
    /// `LowerDigit`.
    /// ```
    /// use convert_case::Boundary;
    /// use Boundary::*;
    /// assert_eq!(
    ///     vec![DigitUpper, UpperDigit, DigitLower, LowerDigit],
    ///     Boundary::digits()
    /// );
    /// ```
    pub fn digits() -> Vec<Self> {
        use Boundary::*;
        vec![DigitUpper, UpperDigit, DigitLower, LowerDigit]
    }

    /// Returns the boundaries that are letters followed by digits: `UpperDigit` and `LowerDigit`.
    /// ```
    /// use convert_case::Boundary;
    /// use Boundary::*;
    /// assert_eq!(
    ///     vec![UpperDigit, LowerDigit],
    ///     Boundary::letter_digit()
    /// );
    /// ```
    pub fn letter_digit() -> Vec<Self> {
        use Boundary::*;
        vec![UpperDigit, LowerDigit]
    }

    /// Returns the boundaries that are digits followed by letters: `DigitUpper` and
    /// `DigitLower`.
    /// ```
    /// use convert_case::Boundary;
    /// use Boundary::*;
    /// assert_eq!(
    ///     vec![DigitUpper, DigitLower],
    ///     Boundary::digit_letter()
    /// );
    /// ```
    pub fn digit_letter() -> Vec<Self> {
        use Boundary::*;
        vec![DigitUpper, DigitLower]
    }

    /// Returns all boundaries.  Note that this includes the `UpperLower` variant which
    /// might be unhelpful.  Please look at [`Boundary::defaults`].
    /// ```
    /// use convert_case::Boundary;
    /// use Boundary::*;
    /// assert_eq!(
    ///     vec![
    ///         Hyphen, Underscore, Space, LowerUpper, UpperLower, DigitUpper,
    ///         UpperDigit, DigitLower, LowerDigit, Acronym,
    ///     ],
    ///     Boundary::all()
    /// );
    /// ```
    pub fn all() -> Vec<Self> {
        use Boundary::*;
        vec![
            Hyphen, Underscore, Space, LowerUpper, UpperLower, DigitUpper, UpperDigit, 
            DigitLower, LowerDigit, Acronym
        ]
    }

    fn detect_one(&self, c: &str) -> bool {
        use Boundary::*;
        match self {
            Hyphen => c == "-",
            Underscore => c == "_",
            Space => c == " ",
            _ => false,
        }
    }

    fn detect_two(&self, c: &str, d: &str) -> bool {
        use Boundary::*;
        match self {
            UpperLower => grapheme_is_uppercase(c) && grapheme_is_lowercase(d),
            LowerUpper => grapheme_is_lowercase(c) && grapheme_is_uppercase(d),
            DigitUpper => grapheme_is_digit(c) && grapheme_is_uppercase(d),
            UpperDigit => grapheme_is_uppercase(c) && grapheme_is_digit(d),
            DigitLower => grapheme_is_digit(c) && grapheme_is_lowercase(d),
            LowerDigit => grapheme_is_lowercase(c) && grapheme_is_digit(d),
            _ => false,
        }
    }

    fn detect_three(&self, c: &str, d: &str, e: &str) -> bool {
        use Boundary::*;
        if let Acronym = self {
            grapheme_is_uppercase(c)
                && grapheme_is_uppercase(d)
                && grapheme_is_lowercase(e)
        } else {
            false
        }
    }
}

fn grapheme_is_digit(c: &str) -> bool {
    c.chars().all(|c| c.is_ascii_digit())
}

fn grapheme_is_uppercase(c: &str) -> bool {
    c.to_uppercase() != c.to_lowercase() && c == c.to_uppercase()
}

fn grapheme_is_lowercase(c: &str) -> bool {
    c.to_uppercase() != c.to_lowercase() && c == c.to_lowercase()
}

pub fn split<T>(s: T, boundaries: &[Boundary]) -> Vec<String>
where
    T: AsRef<str>,
{
    use std::iter::once;
    // create split_points function that counts off by graphemes into list
    
    let s = s.as_ref();

    // Some<bool> means the following
    // None: no split
    // Some(false): split between characters
    // Some(true): split consuming characters

    let left_iter = s.graphemes(true);
    let mid_iter = s.graphemes(true).skip(1);
    let right_iter = s.graphemes(true).skip(2);

    let singles = left_iter.clone();
    let doubles = left_iter.clone().zip(mid_iter.clone());
    let triples = left_iter.zip(mid_iter).zip(right_iter);

    let singles = singles
        .map(|c| boundaries.iter().any(|b| b.detect_one(c)))
        .map(|split| if split {Some(true)} else {None});
    let doubles = doubles
        .map(|(c,d)| boundaries.iter().any(|b| b.detect_two(c, d)))
        .map(|split| if split {Some(false)} else {None});
    let triples = triples
        .map(|((c,d),e)| boundaries.iter().any(|b| b.detect_three(c, d, e)))
        .map(|split| if split {Some(false)} else {None});

    let split_points = singles
        .zip(once(None).chain(doubles))
        .zip(once(None).chain(triples).chain(once(None)))
        .map(|((s, d), t)| s.or(d).or(t));

    let mut words = Vec::new();
    let mut word = String::new();
    for (c, split) in s.graphemes(true).zip(split_points) {
        match split {
            // no split here
            None => word.push_str(c),
            // split here, consume letter
            Some(true) => words.push(std::mem::take(&mut word)),
            // split here, keep letter
            Some(false) => {
                words.push(std::mem::take(&mut word));
                word.push_str(c);
            }
        }
    }
    words.push(word);

    /*
    let mut words = Vec::new();
    let mut left_idx = 0;
    let mut total_chars = 0;
    let mut skip = 0;
    let mut cur = GraphemeCursor::new(left_idx, s.len(), true);

    for (right_idx, split) in split_points.enumerate() {
        match split {
            // no split here
            None => {},
            // split here, consume letter
            Some(true) => {
                let mut right_bound = left_bound;
                for _ in 0..total_chars {
                    right_bound = cur.next_boundary(s, skip).unwrap().unwrap();
                }
                words.push(&s[left_bound..right_bound])
            }
            // split here, keep letter
            Some(false) => {
            }
            // dont push an empty string, do nothing
            _ => {}
        }
    }
    */

    words.into_iter().filter(|s| !s.is_empty()).collect()
}

#[cfg(test)]
mod test {
    use super::*;
    use strum::IntoEnumIterator;

    #[test]
    fn all_boundaries_in_iter() {
        let all = Boundary::all();
        for boundary in Boundary::iter() {
            assert!(all.contains(&boundary));
        }
    }

    #[test]
    fn split_on_delims() {
        assert_eq!(
            vec!["my", "word", "list", "separated", "by", "delims"],
            split("my_word-list separated-by_delims", &Boundary::delims())
        )
    }

    #[test]
    fn boundaries_found_in_string() {
        use Boundary::*;
        assert_eq!(
            vec![UpperLower],
            Boundary::list_from(".Aaaa")
        );
        assert_eq!(
            vec![LowerUpper, UpperLower, LowerDigit],
            Boundary::list_from("a8.Aa.aA")
        );
        assert_eq!(
            Boundary::digits(),
            Boundary::list_from("b1B1b")
        );
        assert_eq!(
            vec![Hyphen, Underscore, Space, Acronym],
            Boundary::list_from("AAa -_")
        );
    }
}
convert_case/segmentation.rs

convert_case/
segmentation.rs