Optimize cases with long potential simple_keys (#555)
This change introduces an index to lookup token numbers referenced by simple_keys in O(1),
thus significantly reducing the performance impact of certain abusively constructed snippets.
When we build up the simple_keys stack, we count on the (formerly named) staleness check to
catch errors where a simple key is required but would be > 1024 chars or span lines. The previous
simplification that searches the stack from the top can go 1024 keys deep before finding a "stale"
key and stopping. I added a test that shows that this consumes ~3s per 1MB of document size.
diff --git a/limit_test.go b/limit_test.go
index f160c87..8d8ec2d 100644
--- a/limit_test.go
+++ b/limit_test.go
@@ -39,6 +39,7 @@
{name: "1000kb of maps", data: []byte(`a: &a [{a}` + strings.Repeat(`,{a}`, 1000*1024/4-1) + `]`)},
{name: "1000kb slice nested at max-depth", data: []byte(strings.Repeat(`[`, 10000) + `1` + strings.Repeat(`,1`, 1000*1024/2-20000-1) + strings.Repeat(`]`, 10000))},
{name: "1000kb slice nested in maps at max-depth", data: []byte("{a,b:\n" + strings.Repeat(" {a,b:", 10000-2) + ` [1` + strings.Repeat(",1", 1000*1024/2-6*10000-1) + `]` + strings.Repeat(`}`, 10000-1))},
+ {name: "1000kb of 10000-nested lines", data: []byte(strings.Repeat(`- `+strings.Repeat(`[`, 10000)+strings.Repeat(`]`, 10000)+"\n", 1000*1024/20000))},
}
func (s *S) TestLimits(c *C) {
@@ -92,6 +93,10 @@
benchmark(b, "1000kb slice nested in maps at max-depth")
}
+func Benchmark1000KBMaxDepthNested(b *testing.B) {
+ benchmark(b, "1000kb of 10000-nested lines")
+}
+
func benchmark(b *testing.B, name string) {
for _, t := range limitTests {
if t.name != name {
diff --git a/scannerc.go b/scannerc.go
index b33bdba..0b9bb60 100644
--- a/scannerc.go
+++ b/scannerc.go
@@ -626,32 +626,18 @@
func yaml_parser_fetch_more_tokens(parser *yaml_parser_t) bool {
// While we need more tokens to fetch, do it.
for {
- // Check if we really need to fetch more tokens.
- need_more_tokens := false
-
- if parser.tokens_head == len(parser.tokens) {
- // Queue is empty.
- need_more_tokens = true
- } else {
- // Check if any potential simple key may occupy the head position.
- for i := len(parser.simple_keys) - 1; i >= 0; i-- {
- simple_key := &parser.simple_keys[i]
- if simple_key.token_number < parser.tokens_parsed {
- break
- }
- if valid, ok := yaml_simple_key_is_valid(parser, simple_key); !ok {
- return false
- } else if valid && simple_key.token_number == parser.tokens_parsed {
- need_more_tokens = true
- break
- }
+ if parser.tokens_head != len(parser.tokens) {
+ // If queue is non-empty, check if any potential simple key may
+ // occupy the head position.
+ head_tok_idx, ok := parser.simple_keys_by_tok[parser.tokens_parsed]
+ if !ok {
+ break
+ } else if valid, ok := yaml_simple_key_is_valid(parser, &parser.simple_keys[head_tok_idx]); !ok {
+ return false
+ } else if !valid {
+ break
}
}
-
- // We are finished.
- if !need_more_tokens {
- break
- }
// Fetch the next token.
if !yaml_parser_fetch_next_token(parser) {
return false
@@ -883,6 +869,7 @@
return false
}
parser.simple_keys[len(parser.simple_keys)-1] = simple_key
+ parser.simple_keys_by_tok[simple_key.token_number] = len(parser.simple_keys) - 1
}
return true
}
@@ -897,9 +884,10 @@
"while scanning a simple key", parser.simple_keys[i].mark,
"could not find expected ':'")
}
+ // Remove the key from the stack.
+ parser.simple_keys[i].possible = false
+ delete(parser.simple_keys_by_tok, parser.simple_keys[i].token_number)
}
- // Remove the key from the stack.
- parser.simple_keys[i].possible = false
return true
}
@@ -930,7 +918,9 @@
func yaml_parser_decrease_flow_level(parser *yaml_parser_t) bool {
if parser.flow_level > 0 {
parser.flow_level--
- parser.simple_keys = parser.simple_keys[:len(parser.simple_keys)-1]
+ last := len(parser.simple_keys) - 1
+ delete(parser.simple_keys_by_tok, parser.simple_keys[last].token_number)
+ parser.simple_keys = parser.simple_keys[:last]
}
return true
}
@@ -1007,6 +997,8 @@
// Initialize the simple key stack.
parser.simple_keys = append(parser.simple_keys, yaml_simple_key_t{})
+ parser.simple_keys_by_tok = make(map[int]int)
+
// A simple key is allowed at the beginning of the stream.
parser.simple_key_allowed = true
@@ -1310,6 +1302,7 @@
// Remove the simple key.
simple_key.possible = false
+ delete(parser.simple_keys_by_tok, simple_key.token_number)
// A simple key cannot follow another simple key.
parser.simple_key_allowed = false
diff --git a/yamlh.go b/yamlh.go
index e25cee5..f6a9c8e 100644
--- a/yamlh.go
+++ b/yamlh.go
@@ -579,6 +579,7 @@
simple_key_allowed bool // May a simple key occur at the current position?
simple_keys []yaml_simple_key_t // The stack of simple keys.
+ simple_keys_by_tok map[int]int // possible simple_key indexes indexed by token_number
// Parser stuff