Skip to content

Commit

Permalink
syntax: unbox Ast and remove AstKind
Browse files Browse the repository at this point in the history
The AstKind experiment proved unfruitful. I think the issue here is that
the savings on Vec<Ast> didn't prove to be enough to offset the extra
heap allocation that resulted from the indirection.

This seems to be a sweet spot. It would be nice to get Ast down below 16
bytes, but it's not clear how to do that (without much larger changes
that I don't feel inclined to pursue).

Fixes #1090
  • Loading branch information
BurntSushi committed Oct 9, 2023
1 parent 31b4398 commit 17d9c1c
Show file tree
Hide file tree
Showing 7 changed files with 161 additions and 190 deletions.
21 changes: 11 additions & 10 deletions fuzz/fuzz_targets/ast_roundtrip.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
use {
libfuzzer_sys::{fuzz_target, Corpus},
regex_syntax::ast::{
parse::Parser, visit, Ast, Flag, Group, GroupKind, SetFlags, Visitor,
parse::Parser, visit, Ast, Flag, Flags, GroupKind, Visitor,
},
};

Expand Down Expand Up @@ -32,16 +32,17 @@ impl Visitor for VerboseVisitor {
}

fn visit_pre(&mut self, ast: &Ast) -> Result<Self::Output, Self::Err> {
let reject_flags = |flags: &Flags| {
flags.flag_state(Flag::IgnoreWhitespace).unwrap_or(false)
};
match ast {
Ast::Flags(SetFlags { flags, .. })
| Ast::Group(Group {
kind: GroupKind::NonCapturing(flags), ..
}) if flags
.flag_state(Flag::IgnoreWhitespace)
.unwrap_or(false) =>
{
Err(())
}
Ast::Flags(x) if reject_flags(&x.flags) => return Err(()),
Ast::Group(x) => match x.kind {
GroupKind::NonCapturing(ref flags) if reject_flags(flags) => {
return Err(())
}
_ => Ok(()),
},
_ => Ok(()),
}
}
Expand Down
4 changes: 3 additions & 1 deletion regex-cli/cmd/generate/fowler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,9 @@ fn count_capturing_groups_ast(ast: &regex_syntax::ast::Ast) -> usize {
| Ast::Literal(_)
| Ast::Dot(_)
| Ast::Assertion(_)
| Ast::Class(_) => 0,
| Ast::ClassUnicode(_)
| Ast::ClassPerl(_)
| Ast::ClassBracketed(_) => 0,
Ast::Repetition(ref rep) => count_capturing_groups_ast(&*rep.ast),
Ast::Group(ref group) => {
let this = if group.is_capturing() { 1 } else { 0 };
Expand Down
168 changes: 68 additions & 100 deletions regex-syntax/src/ast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -429,19 +429,9 @@ pub struct Comment {
///
/// This type defines its own destructor that uses constant stack space and
/// heap space proportional to the size of the `Ast`.
///
/// This type boxes the actual kind of the AST element so that an `Ast` value
/// itself has a very small size. This in turn makes things like `Vec<Ast>` use
/// a lot less memory than it might otherwise, which is particularly beneficial
/// for representing long concatenations or alternations.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Ast(pub Box<AstKind>);

/// The kind of an abstract syntax element.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum AstKind {
pub enum Ast {
/// An empty regex that matches everything.
Empty(Box<Span>),
/// A set of flags, e.g., `(?is)`.
Expand Down Expand Up @@ -473,106 +463,106 @@ pub enum AstKind {
impl Ast {
/// Create an "empty" AST item.
pub fn empty(span: Span) -> Ast {
Ast(Box::new(AstKind::Empty(Box::new(span))))
Ast::Empty(Box::new(span))
}

/// Create a "flags" AST item.
pub fn flags(e: SetFlags) -> Ast {
Ast(Box::new(AstKind::Flags(Box::new(e))))
Ast::Flags(Box::new(e))
}

/// Create a "literal" AST item.
pub fn literal(e: Literal) -> Ast {
Ast(Box::new(AstKind::Literal(Box::new(e))))
Ast::Literal(Box::new(e))
}

/// Create a "dot" AST item.
pub fn dot(span: Span) -> Ast {
Ast(Box::new(AstKind::Dot(Box::new(span))))
Ast::Dot(Box::new(span))
}

/// Create an "assertion" AST item.
pub fn assertion(e: Assertion) -> Ast {
Ast(Box::new(AstKind::Assertion(Box::new(e))))
Ast::Assertion(Box::new(e))
}

/// Create a "Unicode class" AST item.
pub fn class_unicode(e: ClassUnicode) -> Ast {
Ast(Box::new(AstKind::ClassUnicode(Box::new(e))))
Ast::ClassUnicode(Box::new(e))
}

/// Create a "Perl class" AST item.
pub fn class_perl(e: ClassPerl) -> Ast {
Ast(Box::new(AstKind::ClassPerl(Box::new(e))))
Ast::ClassPerl(Box::new(e))
}

/// Create a "bracketed class" AST item.
pub fn class_bracketed(e: ClassBracketed) -> Ast {
Ast(Box::new(AstKind::ClassBracketed(Box::new(e))))
Ast::ClassBracketed(Box::new(e))
}

/// Create a "repetition" AST item.
pub fn repetition(e: Repetition) -> Ast {
Ast(Box::new(AstKind::Repetition(Box::new(e))))
Ast::Repetition(Box::new(e))
}

/// Create a "group" AST item.
pub fn group(e: Group) -> Ast {
Ast(Box::new(AstKind::Group(Box::new(e))))
Ast::Group(Box::new(e))
}

/// Create an "alternation" AST item.
pub fn alternation(e: Alternation) -> Ast {
Ast(Box::new(AstKind::Alternation(Box::new(e))))
Ast::Alternation(Box::new(e))
}

/// Create a "concat" AST item.
pub fn concat(e: Concat) -> Ast {
Ast(Box::new(AstKind::Concat(Box::new(e))))
Ast::Concat(Box::new(e))
}

/// Return the span of this abstract syntax tree.
pub fn span(&self) -> &Span {
match *self.0 {
AstKind::Empty(ref span) => span,
AstKind::Flags(ref x) => &x.span,
AstKind::Literal(ref x) => &x.span,
AstKind::Dot(ref span) => span,
AstKind::Assertion(ref x) => &x.span,
AstKind::ClassUnicode(ref x) => &x.span,
AstKind::ClassPerl(ref x) => &x.span,
AstKind::ClassBracketed(ref x) => &x.span,
AstKind::Repetition(ref x) => &x.span,
AstKind::Group(ref x) => &x.span,
AstKind::Alternation(ref x) => &x.span,
AstKind::Concat(ref x) => &x.span,
match *self {
Ast::Empty(ref span) => span,
Ast::Flags(ref x) => &x.span,
Ast::Literal(ref x) => &x.span,
Ast::Dot(ref span) => span,
Ast::Assertion(ref x) => &x.span,
Ast::ClassUnicode(ref x) => &x.span,
Ast::ClassPerl(ref x) => &x.span,
Ast::ClassBracketed(ref x) => &x.span,
Ast::Repetition(ref x) => &x.span,
Ast::Group(ref x) => &x.span,
Ast::Alternation(ref x) => &x.span,
Ast::Concat(ref x) => &x.span,
}
}

/// Return true if and only if this Ast is empty.
pub fn is_empty(&self) -> bool {
match *self.0 {
AstKind::Empty(_) => true,
match *self {
Ast::Empty(_) => true,
_ => false,
}
}

/// Returns true if and only if this AST has any (including possibly empty)
/// subexpressions.
fn has_subexprs(&self) -> bool {
match *self.0 {
AstKind::Empty(_)
| AstKind::Flags(_)
| AstKind::Literal(_)
| AstKind::Dot(_)
| AstKind::Assertion(_)
| AstKind::ClassUnicode(_)
| AstKind::ClassPerl(_) => false,
AstKind::ClassBracketed(_)
| AstKind::Repetition(_)
| AstKind::Group(_)
| AstKind::Alternation(_)
| AstKind::Concat(_) => true,
match *self {
Ast::Empty(_)
| Ast::Flags(_)
| Ast::Literal(_)
| Ast::Dot(_)
| Ast::Assertion(_)
| Ast::ClassUnicode(_)
| Ast::ClassPerl(_) => false,
Ast::ClassBracketed(_)
| Ast::Repetition(_)
| Ast::Group(_)
| Ast::Alternation(_)
| Ast::Concat(_) => true,
}
}
}
Expand Down Expand Up @@ -1598,48 +1588,48 @@ impl Drop for Ast {
fn drop(&mut self) {
use core::mem;

match *self.0 {
AstKind::Empty(_)
| AstKind::Flags(_)
| AstKind::Literal(_)
| AstKind::Dot(_)
| AstKind::Assertion(_)
| AstKind::ClassUnicode(_)
| AstKind::ClassPerl(_)
match *self {
Ast::Empty(_)
| Ast::Flags(_)
| Ast::Literal(_)
| Ast::Dot(_)
| Ast::Assertion(_)
| Ast::ClassUnicode(_)
| Ast::ClassPerl(_)
// Bracketed classes are recursive, so they get their own Drop impl.
| AstKind::ClassBracketed(_) => return,
AstKind::Repetition(ref x) if !x.ast.has_subexprs() => return,
AstKind::Group(ref x) if !x.ast.has_subexprs() => return,
AstKind::Alternation(ref x) if x.asts.is_empty() => return,
AstKind::Concat(ref x) if x.asts.is_empty() => return,
| Ast::ClassBracketed(_) => return,
Ast::Repetition(ref x) if !x.ast.has_subexprs() => return,
Ast::Group(ref x) if !x.ast.has_subexprs() => return,
Ast::Alternation(ref x) if x.asts.is_empty() => return,
Ast::Concat(ref x) if x.asts.is_empty() => return,
_ => {}
}

let empty_span = || Span::splat(Position::new(0, 0, 0));
let empty_ast = || Ast::empty(empty_span());
let mut stack = vec![mem::replace(self, empty_ast())];
while let Some(mut ast) = stack.pop() {
match *ast.0 {
AstKind::Empty(_)
| AstKind::Flags(_)
| AstKind::Literal(_)
| AstKind::Dot(_)
| AstKind::Assertion(_)
| AstKind::ClassUnicode(_)
| AstKind::ClassPerl(_)
match ast {
Ast::Empty(_)
| Ast::Flags(_)
| Ast::Literal(_)
| Ast::Dot(_)
| Ast::Assertion(_)
| Ast::ClassUnicode(_)
| Ast::ClassPerl(_)
// Bracketed classes are recursive, so they get their own Drop
// impl.
| AstKind::ClassBracketed(_) => {}
AstKind::Repetition(ref mut x) => {
| Ast::ClassBracketed(_) => {}
Ast::Repetition(ref mut x) => {
stack.push(mem::replace(&mut x.ast, empty_ast()));
}
AstKind::Group(ref mut x) => {
Ast::Group(ref mut x) => {
stack.push(mem::replace(&mut x.ast, empty_ast()));
}
AstKind::Alternation(ref mut x) => {
Ast::Alternation(ref mut x) => {
stack.extend(x.asts.drain(..));
}
AstKind::Concat(ref mut x) => {
Ast::Concat(ref mut x) => {
stack.extend(x.asts.drain(..));
}
}
Expand Down Expand Up @@ -1760,35 +1750,13 @@ mod tests {
// 64-bit target. Wow.
#[test]
fn ast_size() {
std::dbg!(core::mem::size_of::<Span>());
std::dbg!(core::mem::size_of::<SetFlags>());
std::dbg!(core::mem::size_of::<Literal>());
std::dbg!(core::mem::size_of::<Span>());
std::dbg!(core::mem::size_of::<Assertion>());
std::dbg!(core::mem::size_of::<ClassUnicode>());
std::dbg!(core::mem::size_of::<ClassPerl>());
std::dbg!(core::mem::size_of::<ClassBracketed>());
std::dbg!(core::mem::size_of::<Repetition>());
std::dbg!(core::mem::size_of::<Group>());
std::dbg!(core::mem::size_of::<Alternation>());
std::dbg!(core::mem::size_of::<Concat>());

let max = core::mem::size_of::<usize>();
let max = 2 * core::mem::size_of::<usize>();
let size = core::mem::size_of::<Ast>();
assert!(
size <= max,
"Ast size of {} bytes is bigger than suggested max {}",
size,
max
);

let max = 2 * core::mem::size_of::<usize>();
let size = core::mem::size_of::<AstKind>();
assert!(
size <= max,
"AstKind size of {} bytes is bigger than suggested max {}",
size,
max
);
}
}
Loading

0 comments on commit 17d9c1c

Please sign in to comment.