[intl] Tightens the string naming controls

In an early field test, lucasradaelli@ discovered that `strings_to_fidl`
let an identifier name such as "role_table," go through. Notice the
extra unwanted comma at the end.

While `strings_to_fidl` doesn't really have a preference, the FIDL file
that is generated based on strings.xml ends up with unallowed syntax.

So, we tighten the behavior to fail fast instead of generating a FIDL
file that won't pass a followup FIDL compilation step.

Example error given below:

```
╰─>$ ./strings_to_fidl --input=$HOME/strings.xml --output=$HOME/strings.xml.out ; cat $HOME/strings.xml.out
Error: while parsing dictionary

Caused by:
    0: in state: String { name: "string_name,", text: "text_string" }
    1: name is not acceptable: 'string_name,', does not match: ^[_a-zA-Z][_a-zA-Z_0-9]*$
```

Bug: 50720
Change-Id: Id178b3c4af434803ced1251a284c2a43947937c8
Reviewed-on: https://fuchsia-review.googlesource.com/c/fuchsia/+/385229
Commit-Queue: Filip Filmar <fmil@google.com>
Reviewed-by: Konstantin Pozin <kpozin@google.com>
Testability-Review: Filip Filmar <fmil@google.com>
diff --git a/src/lib/intl/strings/BUILD.gn b/src/lib/intl/strings/BUILD.gn
index a665f82..b4daf51 100644
--- a/src/lib/intl/strings/BUILD.gn
+++ b/src/lib/intl/strings/BUILD.gn
@@ -27,9 +27,10 @@
     deps = [
       "//third_party/rust_crates:anyhow",
       "//third_party/rust_crates:handlebars",
+      "//third_party/rust_crates:lazy_static",
+      "//third_party/rust_crates:regex",
       "//third_party/rust_crates:serde",
       "//third_party/rust_crates:serde_json",
-      "//third_party/rust_crates:structopt",
       "//third_party/rust_crates:thiserror",
       "//third_party/rust_crates:xml-rs",
     ]
diff --git a/src/lib/intl/strings/src/parser.rs b/src/lib/intl/strings/src/parser.rs
index 67a0b99..fa7da39 100644
--- a/src/lib/intl/strings/src/parser.rs
+++ b/src/lib/intl/strings/src/parser.rs
@@ -6,6 +6,8 @@
 
 use {
     anyhow::{anyhow, Context, Error, Result},
+    lazy_static::lazy_static,
+    regex::Regex,
     std::collections::BTreeMap,
     std::io,
     xml::attribute,
@@ -93,6 +95,31 @@
     Done,
 }
 
+lazy_static! {
+    // All string names must match this regex string. (This should be fixed up
+    // to match the FIDL naming rules)
+    //
+    // Example:
+    // __hello_WORLD -- acceptable
+    // 0cool -- not acceptable, leading number
+    // добар_дан -- not acceptable, not A-Z.
+    static ref RAW_REGEX_STRING: &'static str = r"^[_a-zA-Z][_a-zA-Z_0-9]*$";
+
+    static ref VALID_TOKEN_NAME: Regex = Regex::new(&RAW_REGEX_STRING).unwrap();
+}
+
+// Verifies that [name] is a valid string name.
+pub(crate) fn validate_token_name(name: &str, verbose: bool) -> bool {
+    let is_match = VALID_TOKEN_NAME.is_match(name);
+    veprintln!(
+        verbose,
+        "validate_token_name: checking identifier: '{}'; is_match: {}",
+        name,
+        is_match
+    );
+    is_match
+}
+
 /// The XML parser state.
 #[derive(Debug)]
 pub struct Instance {
@@ -210,6 +237,13 @@
                 if self.messages.contains_key(&token_name) {
                     return Err(anyhow!("duplicate string with name: {}", token_name));
                 }
+                if !validate_token_name(&token_name, self.verbose) {
+                    return Err(anyhow!(
+                        "name is not acceptable: '{}', does not match: {}",
+                        &token_name,
+                        *RAW_REGEX_STRING
+                    ));
+                }
                 self.messages.insert(token_name, text);
                 self.state = State::Resources;
             }
@@ -432,6 +466,19 @@
                </resources>
             "#,
             },
+            TestCase {
+                name: "string_name has unexpected comma",
+                content: r#"
+               <!-- comment -->
+               <?xml version="1.0" encoding="utf-8"?>
+               <resources>
+                 <!-- comment -->
+                 <string
+                   name="string_name,"
+                     >text_string</string>
+               </resources>
+            "#,
+            },
         ];
         for test in tests {
             let input = EventReader::from_str(&test.content);
@@ -446,4 +493,23 @@
         }
         Ok(())
     }
+
+    use super::validate_token_name;
+
+    #[test]
+    fn acceptable_and_unacceptable_names() -> Result<(), Error> {
+        assert_eq!(true, validate_token_name("_hello", true /*verbose*/));
+        assert_eq!(true, validate_token_name("__hello", true /*verbose*/));
+        assert_eq!(true, validate_token_name("__HELLO__", true /*verbose*/));
+        assert_eq!(true, validate_token_name("__H_e_l_l_o__", true /*verbose*/));
+        assert_eq!(true, validate_token_name("__H_e_l_l_o_1234__", true /*verbose*/));
+
+        // Starts with zero
+        assert_eq!(false, validate_token_name("0cool", true /*verbose*/));
+        // Unexpected comma.
+        assert_eq!(false, validate_token_name("role_table,", true /*verbose*/));
+        // Unicode overdose.
+        assert_eq!(false, validate_token_name("хелло_њорлд", true /*verbose*/));
+        Ok(())
+    }
 }