Make combine_corpus only look at folders with a corpus description (#290)

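Currently the combine_corpus tooling iterates over every entry in the
root directory, files and folders alike, and assumes each one is a
folder containing a corpus_description.json. When that file is missing
we do not raise; we only log an error and skip the entry. Still, it
makes sense to be more robust and not log an error for something benign
such as a log file or an empty folder sitting in the corpus directory.
This change globs for */corpus_description.json instead, so only
folders that actually contain a corpus description are processed.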
diff --git a/compiler_opt/tools/combine_training_corpus_lib.py b/compiler_opt/tools/combine_training_corpus_lib.py
index 4547e2b..0359961 100644
--- a/compiler_opt/tools/combine_training_corpus_lib.py
+++ b/compiler_opt/tools/combine_training_corpus_lib.py
@@ -28,17 +28,13 @@
   module_names = []
   output_corpus_description = {}
 
-  for sub_dir in tf.io.gfile.listdir(root_dir):
-    path = os.path.join(root_dir, sub_dir, _FILE_NAME)
+  corpus_description_glob = os.path.join(root_dir, '*/' + _FILE_NAME)
+  for corpus_description_path in tf.io.gfile.glob(corpus_description_glob):
+    logging.info('processing %s', corpus_description_path)
 
-    logging.info('processing %s', path)
-
-    if not tf.io.gfile.exists(path):
-      logging.error('%s does not exist.', path)
-      continue
-
-    with tf.io.gfile.GFile(path, 'r') as f:
+    with tf.io.gfile.GFile(corpus_description_path, 'r') as f:
       corpus_description = json.load(f)
+      sub_dir = os.path.basename(os.path.dirname(corpus_description_path))
       module_names.extend([
           os.path.join(sub_dir, name) for name in corpus_description['modules']
       ])
diff --git a/compiler_opt/tools/combine_training_corpus_test.py b/compiler_opt/tools/combine_training_corpus_test.py
index f1c6697..47dd602 100644
--- a/compiler_opt/tools/combine_training_corpus_test.py
+++ b/compiler_opt/tools/combine_training_corpus_test.py
@@ -54,6 +54,36 @@
     self.assertIn('subcorpus2/test3.o', combined_corpus_description['modules'])
     self.assertIn('subcorpus2/test4.o', combined_corpus_description['modules'])
 
+  def test_empty_folder(self):
+    corpus_dir = self.create_tempdir()
+    subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1')
+    _ = corpus_dir.mkdir(dir_path='empty_dir')  # no description file inside
+    subcorpus1_description = {'modules': ['test1.o', 'test2.o']}
+    subcorpus1_description_file = subcorpus1_dir.create_file(
+        file_path='corpus_description.json')
+    subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
+    combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
+    with open(  # read back the combined description from the corpus root
+        os.path.join(corpus_dir, 'corpus_description.json'),
+        encoding='utf-8') as combined_corpus_description_file:
+      combined_corpus_description = json.load(combined_corpus_description_file)
+    self.assertLen(combined_corpus_description['modules'], 2)
+
+  def test_ignore_extra_file(self):
+    corpus_dir = self.create_tempdir()
+    subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1')
+    _ = corpus_dir.create_file(file_path='empty.log')  # stray regular file
+    subcorpus1_description = {'modules': ['test1.o', 'test2.o']}
+    subcorpus1_description_file = subcorpus1_dir.create_file(
+        file_path='corpus_description.json')
+    subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
+    combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
+    with open(  # read back the combined description from the corpus root
+        os.path.join(corpus_dir, 'corpus_description.json'),
+        encoding='utf-8') as combined_corpus_description_file:
+      combined_corpus_description = json.load(combined_corpus_description_file)
+    self.assertLen(combined_corpus_description['modules'], 2)
+
   def test_different_corpora(self):
     corpus_dir = self.create_tempdir()
     subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1')