@@ -447,6 +447,14 @@ impl Hir {
447447 . map ( |b| ClassBytesRange { start : b, end : b } ) ;
448448 return Hir :: class ( Class :: Bytes ( ClassBytes :: new ( it) ) ) ;
449449 }
450+ // Similar to singleton chars, we can also look for alternations of
451+ // classes. Those can be smushed into a single class.
452+ if let Some ( cls) = class_chars ( & new) {
453+ return Hir :: class ( cls) ;
454+ }
455+ if let Some ( cls) = class_bytes ( & new) {
456+ return Hir :: class ( cls) ;
457+ }
450458 let props = Properties :: alternation ( & new) ;
451459 Hir { kind : HirKind :: Alternation ( new) , props }
452460 }
@@ -854,6 +862,23 @@ impl ClassUnicode {
854862 None
855863 }
856864 }
865+
866+ /// If this class consists of only ASCII ranges, then return its
867+ /// corresponding and equivalent byte class.
868+ pub fn to_byte_class ( & self ) -> Option < ClassBytes > {
869+ if !self . is_all_ascii ( ) {
870+ return None ;
871+ }
872+ Some ( ClassBytes :: new ( self . ranges ( ) . iter ( ) . map ( |r| {
873+ // Since we are guaranteed that our codepoint range is ASCII, the
874+ // 'u8::try_from' calls below are guaranteed to be correct.
875+ ClassBytesRange {
876+ // MSRV(1.59): Use 'u8::try_from(c)' instead.
877+ start : u8:: try_from ( u32:: from ( r. start ) ) . unwrap ( ) ,
878+ end : u8:: try_from ( u32:: from ( r. end ) ) . unwrap ( ) ,
879+ }
880+ } ) ) )
881+ }
857882}
858883
859884/// An iterator over all ranges in a Unicode character class.
@@ -1120,6 +1145,23 @@ impl ClassBytes {
11201145 None
11211146 }
11221147 }
1148+
1149+ /// If this class consists of only ASCII ranges, then return its
1150+ /// corresponding and equivalent Unicode class.
1151+ pub fn to_unicode_class ( & self ) -> Option < ClassUnicode > {
1152+ if !self . is_all_ascii ( ) {
1153+ return None ;
1154+ }
1155+ Some ( ClassUnicode :: new ( self . ranges ( ) . iter ( ) . map ( |r| {
1156+ // Since we are guaranteed that our byte range is ASCII, the
1157+ // 'char::from' calls below are correct and will not erroneously
1158+ // convert a raw byte value into its corresponding codepoint.
1159+ ClassUnicodeRange {
1160+ start : char:: from ( r. start ) ,
1161+ end : char:: from ( r. end ) ,
1162+ }
1163+ } ) ) )
1164+ }
11231165}
11241166
11251167/// An iterator over all ranges in a byte character class.
@@ -1936,6 +1978,44 @@ impl Iterator for LookSetIter {
19361978 }
19371979}
19381980
1981+ /// Given a sequence of HIR values where each value corresponds to a Unicode
1982+ /// class (or an all-ASCII byte class), return a single Unicode class
1983+ /// corresponding to the union of the classes found.
1984+ fn class_chars ( hirs : & [ Hir ] ) -> Option < Class > {
1985+ let mut cls = ClassUnicode :: new ( vec ! [ ] ) ;
1986+ for hir in hirs. iter ( ) {
1987+ match * hir. kind ( ) {
1988+ HirKind :: Class ( Class :: Unicode ( ref cls2) ) => {
1989+ cls. union ( cls2) ;
1990+ }
1991+ HirKind :: Class ( Class :: Bytes ( ref cls2) ) => {
1992+ cls. union ( & cls2. to_unicode_class ( ) ?) ;
1993+ }
1994+ _ => return None ,
1995+ } ;
1996+ }
1997+ Some ( Class :: Unicode ( cls) )
1998+ }
1999+
2000+ /// Given a sequence of HIR values where each value corresponds to a byte class
2001+ /// (or an all-ASCII Unicode class), return a single byte class corresponding
2002+ /// to the union of the classes found.
2003+ fn class_bytes ( hirs : & [ Hir ] ) -> Option < Class > {
2004+ let mut cls = ClassBytes :: new ( vec ! [ ] ) ;
2005+ for hir in hirs. iter ( ) {
2006+ match * hir. kind ( ) {
2007+ HirKind :: Class ( Class :: Unicode ( ref cls2) ) => {
2008+ cls. union ( & cls2. to_byte_class ( ) ?) ;
2009+ }
2010+ HirKind :: Class ( Class :: Bytes ( ref cls2) ) => {
2011+ cls. union ( cls2) ;
2012+ }
2013+ _ => return None ,
2014+ } ;
2015+ }
2016+ Some ( Class :: Bytes ( cls) )
2017+ }
2018+
19392019/// Given a sequence of HIR values where each value corresponds to a literal
19402020/// that is a single `char`, return that sequence of `char`s. Otherwise return
19412021/// None. No deduplication is done.
0 commit comments