Skip to content

Commit e944b2c

Browse files
committed
Refactor parser: Rename functions for clarity and improve tag handling with zero-copy parsing
1 parent 2309321 commit e944b2c

File tree

1 file changed

+72
-69
lines changed

1 file changed

+72
-69
lines changed

esi/src/parser.rs

Lines changed: 72 additions & 69 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@ use nom::branch::alt;
33
// Using STREAMING parsers - they return Incomplete when they need more data
44
// This enables TRUE bounded-memory streaming
55
use nom::bytes::streaming::{
6-
tag, tag_no_case, take, take_until, take_while, take_while1, take_while_m_n,
6+
tag, tag_no_case, take, take_till, take_until, take_while, take_while1, take_while_m_n,
77
};
88
use nom::character::streaming::{alpha1, multispace0, multispace1};
99
use nom::combinator::{map, map_res, not, opt, peek, recognize};
@@ -245,7 +245,7 @@ fn esi_assign<'a>(
245245
alt((esi_assign_short, |i| esi_assign_long(original, i)))(input)
246246
}
247247

248-
fn parse_assign_attributes_short(attrs: Vec<(String, String)>) -> Vec<Element> {
248+
fn assign_attributes_short(attrs: Vec<(String, String)>) -> Vec<Element> {
249249
let mut name = String::new();
250250
let mut value_str = String::new();
251251
for (key, val) in attrs {
@@ -269,7 +269,7 @@ fn parse_assign_attributes_short(attrs: Vec<(String, String)>) -> Vec<Element> {
269269
vec![Element::Esi(Tag::Assign { name, value })]
270270
}
271271

272-
fn parse_assign_long(attrs: Vec<(String, String)>, content: Vec<Element>) -> Vec<Element> {
272+
fn assign_long(attrs: Vec<(String, String)>, content: Vec<Element>) -> Vec<Element> {
273273
let mut name = String::new();
274274
for (key, val) in attrs {
275275
if key == "name" {
@@ -316,7 +316,7 @@ fn esi_assign_short(input: &[u8]) -> IResult<&[u8], Vec<Element>, Error<&[u8]>>
316316
attributes,
317317
preceded(multispace0, self_closing),
318318
),
319-
parse_assign_attributes_short,
319+
assign_attributes_short,
320320
)(input)
321321
}
322322

@@ -334,7 +334,7 @@ fn esi_assign_long<'a>(
334334
|i| parse_interpolated(original, i),
335335
tag(b"</esi:assign>"),
336336
)),
337-
|(attrs, content, _)| parse_assign_long(attrs, content),
337+
|(attrs, content, _)| assign_long(attrs, content),
338338
)(input)
339339
}
340340

@@ -448,9 +448,7 @@ fn esi_when<'a>(
448448
)(input)
449449
}
450450

451-
// Removed - use parse_complete() directly for delimited content
452-
453-
// Zero-copy version used by both esi_tag and esi_tag_old (via parse_interpolated)
451+
/// Zero-copy parser for <esi:choose>...</esi:choose>
454452
fn esi_choose<'a>(
455453
original: &Bytes,
456454
input: &'a [u8],
@@ -731,6 +729,11 @@ fn single_quote(input: &[u8]) -> IResult<&[u8], &[u8], Error<&[u8]>> {
731729
tag(b"\'")(input)
732730
}
733731

732+
#[inline]
733+
fn is_closing_bracket(b: u8) -> bool {
734+
b == b'>'
735+
}
736+
734737
#[inline]
735738
fn is_double_quote(b: u8) -> bool {
736739
b == b'\"'
@@ -807,10 +810,14 @@ fn tag_handler<'a>(
807810
_ if name.eq_ignore_ascii_case(b"script") => html_script_tag(original, start),
808811

809812
// Regular HTML tag - continue parsing from where we left off
810-
// (we've already consumed `<tagname`, just need to find `>`)
811813
_ => {
812-
let (input, _) = take_until(b">".as_ref())(input)?;
814+
// we've already consumed `<tagname`, let's find `>`
815+
// Consume everything up to '>'
816+
let (input, _) = take_till(is_closing_bracket)(input)?;
817+
// Consume the '>' itself
813818
let (input, _) = closing_bracket(input)?;
819+
820+
// Calculate the full tag from start (includes `<tagname...>`)
814821
let full_tag = &start[..start.len() - input.len()];
815822

816823
Ok((
@@ -845,43 +852,35 @@ fn script_content(input: &[u8]) -> IResult<&[u8], &[u8], Error<&[u8]>> {
845852
}
846853

847854
/// script tag parser - input starts at <script
855+
/// Treats all script tags (inline and external) as HTML elements
848856
fn html_script_tag<'a>(
849857
original: &Bytes,
850858
input: &'a [u8],
851859
) -> IResult<&'a [u8], Vec<Element>, Error<&'a [u8]>> {
852860
let start = input;
853-
let (input, _) = tag_no_case(b"<script")(input)?;
854-
let (input, attrs) = attributes(input)?;
855-
let (input, _) = closing_bracket(input)?;
856-
let opening = &start[..start.len() - input.len()];
857-
858-
let has_src = attrs.iter().any(|(k, _)| k == "src");
859-
860-
if has_src {
861-
// External script - return just the opening tag as HTML
862-
return Ok((
863-
input,
864-
vec![Element::Html(slice_as_bytes(original, opening))],
865-
));
866-
}
867861

868-
// Inline script - find closing </script> tag (case insensitive)
869-
let (input, content) = script_content(input)?;
862+
// Parse opening tag
863+
let (input, _) = recognize(delimited(
864+
tag_no_case(b"<script"),
865+
take_till(is_closing_bracket),
866+
closing_bracket,
867+
))(input)?;
870868

871-
// Parse closing tag
872-
let closing_start = input;
873-
let (input, _) = tag_no_case(b"</script")(input)?;
874-
let (input, _) = multispace0(input)?;
875-
let (input, _) = closing_bracket(input)?;
876-
let closing = &closing_start[..closing_start.len() - input.len()];
869+
// Parse content (if any) and closing tag (if any)
870+
let (input, _) = opt(tuple((
871+
script_content,
872+
recognize(delimited(
873+
tag_no_case(b"</script"),
874+
multispace0,
875+
closing_bracket,
876+
)),
877+
)))(input)?;
877878

879+
// Return entire script tag as single HTML element
880+
let full_script = &start[..start.len() - input.len()];
878881
Ok((
879882
input,
880-
vec![
881-
Element::Html(slice_as_bytes(original, opening)),
882-
Element::Text(slice_as_bytes(original, content)),
883-
Element::Html(slice_as_bytes(original, closing)),
884-
],
883+
vec![Element::Html(slice_as_bytes(original, full_script))],
885884
))
886885
}
887886

@@ -930,18 +929,18 @@ fn is_lower_alphanumeric_or_underscore(c: u8) -> bool {
930929
c.is_ascii_lowercase() || c.is_ascii_digit() || c == b'_'
931930
}
932931

933-
fn fn_name(input: &[u8]) -> IResult<&[u8], String, Error<&[u8]>> {
932+
fn esi_fn_name(input: &[u8]) -> IResult<&[u8], String, Error<&[u8]>> {
934933
map(
935934
preceded(tag(b"$"), take_while1(is_lower_alphanumeric_or_underscore)),
936935
bytes_to_string,
937936
)(input)
938937
}
939938

940-
fn var_name(input: &[u8]) -> IResult<&[u8], Expr, Error<&[u8]>> {
939+
fn esi_var_name(input: &[u8]) -> IResult<&[u8], Expr, Error<&[u8]>> {
941940
map(
942941
tuple((
943942
take_while1(is_alphanumeric_or_underscore),
944-
opt(delimited(tag(b"{"), var_key_expr, tag(b"}"))),
943+
opt(delimited(tag(b"{"), esi_var_key_expr, tag(b"}"))),
945944
opt(preceded(tag(b"|"), fn_nested_argument)),
946945
)),
947946
|(name, key, default): (&[u8], _, _)| {
@@ -964,7 +963,11 @@ fn not_dollar_or_curlies(input: &[u8]) -> IResult<&[u8], String, Error<&[u8]>> {
964963
// TODO: handle escaping
965964
fn single_quoted_string(input: &[u8]) -> IResult<&[u8], String, Error<&[u8]>> {
966965
map(
967-
delimited(tag(b"'"), take_while(|c| c != b'\''), tag(b"'")),
966+
delimited(
967+
single_quote,
968+
take_while(|c| !is_single_quote(c)),
969+
single_quote,
970+
),
968971
bytes_to_string,
969972
)(input)
970973
}
@@ -996,11 +999,11 @@ fn var_key(input: &[u8]) -> IResult<&[u8], String, Error<&[u8]>> {
996999
))(input)
9971000
}
9981001

999-
// Parse subscript key - can be a string or a nested variable expression
1000-
fn var_key_expr(input: &[u8]) -> IResult<&[u8], Expr, Error<&[u8]>> {
1002+
/// Parse subscript key - can be a string or a nested variable expression
1003+
fn esi_var_key_expr(input: &[u8]) -> IResult<&[u8], Expr, Error<&[u8]>> {
10011004
alt((
10021005
// Try to parse as a variable first (e.g., $(keyVar))
1003-
variable,
1006+
esi_variable,
10041007
// Otherwise parse as a string
10051008
map(var_key, |s: String| Expr::String(Some(s))),
10061009
))(input)
@@ -1020,7 +1023,7 @@ fn fn_argument(input: &[u8]) -> IResult<&[u8], Vec<Expr>, Error<&[u8]>> {
10201023
}
10211024

10221025
fn fn_nested_argument(input: &[u8]) -> IResult<&[u8], Expr, Error<&[u8]>> {
1023-
alt((call, variable, string, integer, bareword))(input)
1026+
alt((esi_function, esi_variable, string, integer, bareword))(input)
10241027
}
10251028

10261029
fn integer(input: &[u8]) -> IResult<&[u8], Expr, Error<&[u8]>> {
@@ -1040,9 +1043,9 @@ fn bareword(input: &[u8]) -> IResult<&[u8], Expr, Error<&[u8]>> {
10401043
)(input)
10411044
}
10421045

1043-
fn call(input: &[u8]) -> IResult<&[u8], Expr, Error<&[u8]>> {
1046+
fn esi_function(input: &[u8]) -> IResult<&[u8], Expr, Error<&[u8]>> {
10441047
let (input, parsed) = tuple((
1045-
fn_name,
1048+
esi_fn_name,
10461049
delimited(
10471050
terminated(tag(b"("), multispace0),
10481051
fn_argument,
@@ -1055,8 +1058,8 @@ fn call(input: &[u8]) -> IResult<&[u8], Expr, Error<&[u8]>> {
10551058
Ok((input, Expr::Call(name, args)))
10561059
}
10571060

1058-
fn variable(input: &[u8]) -> IResult<&[u8], Expr, Error<&[u8]>> {
1059-
delimited(tag(b"$("), var_name, tag(b")"))(input)
1061+
fn esi_variable(input: &[u8]) -> IResult<&[u8], Expr, Error<&[u8]>> {
1062+
delimited(tag(b"$("), esi_var_name, tag(b")"))(input)
10601063
}
10611064

10621065
fn operator(input: &[u8]) -> IResult<&[u8], Operator, Error<&[u8]>> {
@@ -1076,7 +1079,9 @@ fn operator(input: &[u8]) -> IResult<&[u8], Operator, Error<&[u8]>> {
10761079
}
10771080

10781081
fn interpolated_expression(input: &[u8]) -> IResult<&[u8], Vec<Element>, Error<&[u8]>> {
1079-
map(alt((call, variable)), |expr| vec![Element::Expr(expr)])(input)
1082+
map(alt((esi_function, esi_variable)), |expr| {
1083+
vec![Element::Expr(expr)]
1084+
})(input)
10801085
}
10811086

10821087
fn primary_expr(input: &[u8]) -> IResult<&[u8], Expr, Error<&[u8]>> {
@@ -1093,8 +1098,8 @@ fn primary_expr(input: &[u8]) -> IResult<&[u8], Expr, Error<&[u8]>> {
10931098
tag(b")"),
10941099
),
10951100
// Parse basic expressions
1096-
call,
1097-
variable,
1101+
esi_function,
1102+
esi_variable,
10981103
integer,
10991104
string,
11001105
))(input)
@@ -1164,7 +1169,7 @@ mod tests {
11641169
}
11651170

11661171
#[test]
1167-
fn test_new_parse() {
1172+
fn test_parse() {
11681173
let input = br#"
11691174
<a>foo</a>
11701175
<bar />
@@ -1221,35 +1226,33 @@ exception!
12211226
}
12221227
}
12231228
#[test]
1224-
fn test_new_parse_script() {
1229+
fn test_parse_script() {
12251230
let input = b"<sCripT> less < more </scRIpt>";
12261231
let bytes = Bytes::from_static(input);
12271232
let (rest, x) = html_script_tag(&bytes, input).unwrap();
12281233
assert_eq!(rest.len(), 0);
12291234
assert_eq!(
12301235
x,
1231-
[
1232-
Element::Html(Bytes::from_static(b"<sCripT>")),
1233-
Element::Text(Bytes::from_static(b" less < more ")),
1234-
Element::Html(Bytes::from_static(b"</scRIpt>"))
1235-
]
1236+
[Element::Html(Bytes::from_static(
1237+
b"<sCripT> less < more </scRIpt>"
1238+
))]
12361239
);
12371240
}
12381241
#[test]
1239-
fn test_new_parse_script_with_src() {
1240-
let input = b"<sCripT src=\"whatever\">";
1242+
fn test_parse_script_with_src() {
1243+
let input = b"<sCripT src=\"whatever\"></sCripT>";
12411244
let bytes = Bytes::from_static(input);
1242-
let (rest, x) = parse_complete(&bytes).unwrap();
1245+
let (rest, x) = html_script_tag(&bytes, input).unwrap();
12431246
assert_eq!(rest.len(), 0);
12441247
assert_eq!(
12451248
x,
12461249
[Element::Html(Bytes::from_static(
1247-
b"<sCripT src=\"whatever\">"
1250+
b"<sCripT src=\"whatever\"></sCripT>"
12481251
))]
12491252
);
12501253
}
12511254
#[test]
1252-
fn test_new_parse_esi_vars_short() {
1255+
fn test_parse_esi_vars_short() {
12531256
let input = br#"<esi:vars name="$(hello)"/>"#;
12541257
let bytes = Bytes::from_static(input);
12551258
let (rest, x) = esi_vars(&bytes, input).unwrap();
@@ -1264,7 +1267,7 @@ exception!
12641267
);
12651268
}
12661269
#[test]
1267-
fn test_new_parse_esi_vars_long() {
1270+
fn test_parse_esi_vars_long() {
12681271
// Nested <esi:vars> tags are not supported to prevent infinite recursion
12691272
// The inner <esi:vars> tags should be treated as plain text/HTML
12701273
let input = br#"<esi:vars>hello<br></esi:vars>"#;
@@ -1297,7 +1300,7 @@ exception!
12971300
);
12981301
}
12991302
#[test]
1300-
fn test_new_parse_complex_expr() {
1303+
fn test_parse_complex_expr() {
13011304
let input = br#"<esi:vars name="$call('hello') matches $(var{'key'})"/>"#;
13021305
let bytes = Bytes::from_static(input);
13031306
let (rest, x) = parse_complete(&bytes).unwrap();
@@ -1396,15 +1399,15 @@ exception!
13961399
}
13971400

13981401
#[test]
1399-
fn test_new_parse_plain_text() {
1402+
fn test_parse_plain_text() {
14001403
let input = b"hello\nthere";
14011404
let bytes = Bytes::from_static(input);
14021405
let (rest, x) = parse_complete(&bytes).unwrap();
14031406
assert_eq!(rest.len(), 0);
14041407
assert_eq!(x, [Element::Text(Bytes::from_static(b"hello\nthere"))]);
14051408
}
14061409
#[test]
1407-
fn test_new_parse_interpolated() {
1410+
fn test_parse_interpolated() {
14081411
let input = b"hello $(foo)<esi:vars>goodbye $(foo)</esi:vars>";
14091412
let bytes = Bytes::from_static(input);
14101413
let (rest, x) = parse_complete(&bytes).unwrap();
@@ -1419,7 +1422,7 @@ exception!
14191422
);
14201423
}
14211424
#[test]
1422-
fn test_new_parse_examples() {
1425+
fn test_parse_examples() {
14231426
let input = include_bytes!("../../examples/esi_vars_example/src/index.html");
14241427
let bytes = Bytes::from_static(input);
14251428
let (rest, _) = parse_complete(&bytes).unwrap();

0 commit comments

Comments
 (0)