# Copyright 2023 The Bazel Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# parse.py is a long-lived program that communicates over STDIN and STDOUT.
# STDIN receives parse requests, one per line. For each request it writes the
# parsed modules, comments, and __main__ check for all of the request's files
# to STDOUT.
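#
# For example, a request line (the paths and names here are illustrative)
# looks like:
#   {"repo_root": "/path/to/repo", "rel_package_path": "pkg", "filenames": ["app.py"]}
# and the matching response is a JSON array of per-file results, terminated
# by a NUL byte:
#   [{"filename": "app.py", "modules": [...], "comments": [...], "has_main": false}]
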
import ast
import concurrent.futures
import json
import os
import sys
from io import BytesIO
from tokenize import COMMENT, NAME, OP, STRING, tokenize


def parse_import_statements(content, filepath):
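    """Extracts the import statements from a Python module's source.

    The whole AST is walked, so imports nested inside functions or
    conditionals are collected too. Relative imports (ast.ImportFrom
    nodes with level > 0) are skipped.
    """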
    modules = list()
    tree = ast.parse(content, filename=filepath)
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            for subnode in node.names:
                module = {
                    "name": subnode.name,
                    "lineno": node.lineno,
                    "filepath": filepath,
                    "from": "",
                }
                modules.append(module)
        elif isinstance(node, ast.ImportFrom) and node.level == 0:
            for subnode in node.names:
                module = {
                    "name": f"{node.module}.{subnode.name}",
                    "lineno": node.lineno,
                    "filepath": filepath,
                    "from": node.module,
                }
                modules.append(module)
    return modules


def parse_comments(content):
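    """Returns the text of every comment token in the given source."""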
    comments = list()
    g = tokenize(BytesIO(content.encode("utf-8")).readline)
    for toknum, tokval, _, _, _ in g:
        if toknum == COMMENT:
            comments.append(tokval)
    return comments


def parse_main(content):
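    """Reports whether the source has an `if __name__ == "__main__":` block.

    Scans the raw token stream rather than the AST: it looks for an `if`
    at column 0 followed by exactly the tokens `__name__`, `==`, a
    `"__main__"` string, and `:`.
    """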
    g = tokenize(BytesIO(content.encode("utf-8")).readline)
    for token_type, token_val, start, _, _ in g:
        if token_type != NAME or token_val != "if" or start[1] != 0:
            continue
        try:
            token_type, token_val, start, _, _ = next(g)
            if token_type != NAME or token_val != "__name__":
                continue
            token_type, token_val, start, _, _ = next(g)
            if token_type != OP or token_val != "==":
                continue
            token_type, token_val, start, _, _ = next(g)
            if token_type != STRING or token_val.strip("\"'") != "__main__":
                continue
            token_type, token_val, start, _, _ = next(g)
            if token_type != OP or token_val != ":":
                continue
            return True
        except StopIteration:
            break
    return False


def parse(repo_root, rel_package_path, filename):
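    """Parses one file and collects its imports, comments, and main check.

    The three passes are independent of one another, so they run
    concurrently in a small thread pool.
    """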
    rel_filepath = os.path.join(rel_package_path, filename)
    abs_filepath = os.path.join(repo_root, rel_filepath)
    with open(abs_filepath, "r") as file:
        content = file.read()
    # From simple benchmarks, 2 workers gave the best performance here.
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        modules_future = executor.submit(
            parse_import_statements, content, rel_filepath
        )
        comments_future = executor.submit(parse_comments, content)
        main_future = executor.submit(parse_main, content)
        modules = modules_future.result()
        comments = comments_future.result()
        has_main = main_future.result()
    output = {
        "filename": filename,
        "modules": modules,
        "comments": comments,
        "has_main": has_main,
    }
    return output


def main(stdin, stdout):
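    """Serves parse requests from stdin until EOF.

    Each request is a JSON line naming a package directory and its files.
    Multi-file requests are parsed in parallel across worker processes.
    Each response is written to stdout followed by a NUL byte (0x00) so
    the reader can tell where one response ends and the next begins.
    """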
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for parse_request in stdin:
            parse_request = json.loads(parse_request)
            repo_root = parse_request["repo_root"]
            rel_package_path = parse_request["rel_package_path"]
            filenames = parse_request["filenames"]
            outputs = list()
            if len(filenames) == 1:
                outputs.append(parse(repo_root, rel_package_path, filenames[0]))
            else:
                futures = [
                    executor.submit(parse, repo_root, rel_package_path, filename)
                    for filename in filenames
                    if filename != ""
                ]
                for future in concurrent.futures.as_completed(futures):
                    outputs.append(future.result())
            print(json.dumps(outputs), end="", file=stdout, flush=True)
            stdout.buffer.write(bytes([0]))
            stdout.flush()


if __name__ == "__main__":
    exit(main(sys.stdin, sys.stdout))