| // Copyright 2022 The Fuchsia Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| //! link_checker implements the [`DocCheck`] trait used to perform checks on the links and images |
| //! found in markdown documentation in the Fuchsia project. |
| |
| use crate::{ |
| md_element::{CowStr, Element, LinkType}, |
| DocCheck, DocCheckError, DocCheckerArgs, DocLine, |
| }; |
| use anyhow::{bail, Result}; |
| use async_trait::async_trait; |
| use fuchsia_hyper::{new_https_client_from_tcp_options, HttpsClient, TcpOptions}; |
| use http::{uri::Uri, Request, StatusCode}; |
| use hyper::Body; |
| use std::{ |
| collections::{HashMap, HashSet}, |
| ffi::OsStr, |
| path::{self, Path, PathBuf}, |
| }; |
| use url::Url; |
| |
| // path_helper is a wrapper that allows mocking the path checks |
| // `exists` and `is_dir`. |
| cfg_if::cfg_if! { |
| if #[cfg(test)] { |
| use crate::mock_path_helper_module as path_helper; |
| } else { |
| use crate::path_helper_module as path_helper; |
| } |
| } |
/// Base names of the markdown files that are allowed to link to the
/// documentation host site.
const FILES_ALLOWED_TO_LINK_TO_PUBLISHED_DOCS: [&str; 1] = ["navbar.md"];

/// Host name of the Gerrit server that hosts the source repositories.
const GERRIT_HOST: &str = "fuchsia.googlesource.com";

/// Host name of the site where the documentation is published.
pub(crate) const PUBLISHED_DOCS_HOST: &str = "fuchsia.dev";

/// Words that cannot be used as the single-word ALT text of an image.
/// The list is small for now (n < 5); if it grows large, it might be
/// better to manage it as an external file rather than inline.
// TODO(fxbug.dev/113039): disallow "drawing" and "image" for alt text.
const DISALLOWED_ALT_IMAGE_TEXT: [&str; 1] = [""];

/// Active repos under fuchsia.googlesource.com which can be linked to.
const VALID_PROJECTS: [&str; 19] = [
    "", // root page of all projects
    "cobalt",
    "drivers", // This is a family of projects.
    "experiences",
    "fargo",
    "fidl-misc",
    "fidlbolt",
    "fontdata",
    "fuchsia",
    "infra", // This is a family of projects, there are sub-repos below this path.
    "integration",
    "intellij-language-fidl",
    "jiri",
    "manifest",
    "third_party", // This is a family of projects, there are sub-repos below this path.
    "vscode-language-fidl",
    "workstation",
    "samples",
    "sdk-samples", // This is a family of projects, there are sub-repos below this path.
];

/// Top-level paths of the published doc site that any page may link to.
/// Links to other locations have to be allowed by adding the source doc
/// page to `FILES_ALLOWED_TO_LINK_TO_PUBLISHED_DOCS`.
///
/// * `""` is the root of fuchsia.dev.
/// * `"reference"` is the generated reference documentation.
/// * `"schema"` is the schema URLs used for parsing.
const PUBLISHED_LINKS_ALLOWED: [&str; 3] = ["", "reference", "schema"];
| |
| /// A link (URL, or file path) and it location in the markdown. |
| #[derive(Debug, Eq, Hash, PartialEq)] |
| pub struct LinkReference { |
| pub link: String, |
| pub location: DocLine, |
| } |
| |
| /// LinkChecker checks the links and images in markdown files. |
| /// As the links are checked, links to external websites are collected and optionally |
| /// checked in the post-check. |
| #[derive(Debug)] |
| struct LinkChecker { |
| pub root_dir: PathBuf, |
| pub project: String, |
| pub docs_folder: PathBuf, |
| pub check_remote_links: bool, |
| links: Vec<LinkReference>, |
| } |
| |
| impl LinkChecker { |
| /// Takes the raw link from the markdown parser, and normalizes it into |
| /// a string that is a filepath or URL. |
| fn make_link_to_check(&self, filename: &Path, link_url: &CowStr<'_>) -> Result<String> { |
| let link_to_check: String; |
| let link = link_url.trim(); |
| let filename_string = filename.to_string_lossy(); |
| |
| // relative_filename is relative to the root e.g. /home/googler/fuchsia/docs/file.md |
| // is /docs/file.md. Note that this form (with the leading /) is used a lot because |
| // when published, it is at the root of the documentation site. |
| // |
| // This does cause confusion in this code since PathBuf::join() does not naively join |
| // paths with a leading /. See https://doc.rust-lang.org/nightly/std/path/struct.PathBuf.html#method.push |
| // for more details. |
| let relative_filename = |
| filename_string.strip_prefix(self.root_dir.to_string_lossy().as_ref()).unwrap_or(""); |
| |
| // relative_parent is the directory of the file name, e.g /docs/file.md is /docs. |
| let temp_path = PathBuf::from(relative_filename); |
| let relative_parent = temp_path.parent().unwrap_or(&self.docs_folder); |
| |
| // External links that have any query parameters are decoded when parsed by the markdown parser. |
| // To make things easier later, parse the URL and use the encoded parameters. |
| if link.starts_with("http://") || link.starts_with("https://") { |
| let url = Url::parse(link).or_else(|e| bail!("Could not parse url {}: {}", link, e))?; |
| if let Some(query) = url.query() { |
| // split on ? from the original string, to avoid complexities |
| // to to-stringizing a url without the query params. |
| if let Some((first_part, _)) = link.split_once('?') { |
| let encoded_link = format!("{}?{}", first_part, query); |
| link_to_check = encoded_link; |
| } else { |
| bail!("Cannot parse {}. Appears to have query parameters, but no ? in the string?", link); |
| } |
| } else { |
| link_to_check = url.to_string(); |
| } |
| } else if link.starts_with("/reference") { |
| // Generated reference docs are in /reference, and are |
| // treated as external since they are not part of the source tree. |
| link_to_check = format!("https://{}{}", PUBLISHED_DOCS_HOST, link); |
| } else if link.starts_with('/') { |
| // paths are used as-is. |
| link_to_check = link.to_string(); |
| } else if link.starts_with('#') { |
| // Anchors are appended to the current file. |
| link_to_check = format!("{}{}", self.root_dir.join(relative_filename).display(), link); |
| } else { |
| // Otherwise, see if it is parsable as a URI, if not, append it to the relative_parent |
| // and hope for the best. This usually is something relative like "details-subdir/info.md" |
| let uri: Uri = match link.parse() { |
| Ok(u) => u, |
| Err(_e) => { |
| let parent_based_link = |
| format!("{}/{}", relative_parent.to_string_lossy(), link); |
| parent_based_link.parse::<Uri>().or_else(|e| { |
| bail!("Cannot parse parent based uri: {}: {}", parent_based_link, e) |
| })? |
| } |
| }; |
| |
| // Check the scheme. If there is one, use it. |
| // If there is mailto: (which is commonly used without the //) use the original link |
| // Otherwise, it is a relative file path that what parsed. |
| link_to_check = match uri.scheme() { |
| Some(_) => uri.to_string(), |
| None if link.contains("mailto:") => link.to_string(), |
| None => format!("{}/{}", relative_parent.to_str().unwrap(), link), |
| }; |
| } |
| Ok(link_to_check) |
| } |
| } |
| |
| #[async_trait] |
| impl DocCheck for LinkChecker { |
| fn name(&self) -> &str { |
| "LinkChecker" |
| } |
| /// Applies the checks for links. |
| fn check<'a>(&mut self, element: &'a Element<'_>) -> Result<Option<Vec<DocCheckError>>> { |
| let mut errors: Vec<DocCheckError> = vec![]; |
| |
| // Get all the links from the element. This is needed since the element is commonly |
| // a Block or some other collection of elements. |
| if let Some(links) = element.get_links() { |
| for ele in links { |
| let link: &'a CowStr<'a> = match ele { |
| Element::Link(link_type, link_url, link_title, elements, _) => { |
| let link = match link_type { |
| LinkType::Inline => link_url, |
| LinkType::Reference => link_url, |
| LinkType::ReferenceUnknown => { |
| let text = elements |
| .iter() |
| .map(|e| e.get_contents()) |
| .collect::<Vec<String>>() |
| .join(""); |
| if link_url.starts_with("\"") && link_url.ends_with("\"") { |
| // This is not a link but an array of quoted strings. |
| continue; |
| } |
| if text == link_url.to_string() && link_url == link_title { |
| errors.push(DocCheckError::new_info_helpful( |
| ele.doc_line().line_num, |
| ele.doc_line().file_name.clone(), |
| &format!( |
| "unescaped [{}] not treating this as a reference link. this is brackets ", |
| link_url), |
| &format!("escaped \\[{}\\] or make a link [{}](/docs/{}", link_title, link_url,link_url) |
| )); |
| } else { |
| errors.push(DocCheckError::new_error_helpful( |
| ele.doc_line().line_num, |
| ele.doc_line().file_name.clone(), |
| &format!( |
| "Unknown reference link to [{}][{}]", |
| text ,link_url |
| ), |
| &format!( |
| "making sure you added a matching [{}]: YOUR_LINK_HERE below this reference", |
| link_url))); |
| } |
| continue; |
| } |
| LinkType::Collapsed => link_url, |
| LinkType::CollapsedUnknown => { |
| errors.push(DocCheckError::new_error( |
| ele.doc_line().line_num, |
| ele.doc_line().file_name.clone(), |
| &format!( |
| "Unknown collapsed link to {} ({})", |
| link_url, link_title |
| ), |
| )); |
| link_url |
| } |
| LinkType::Shortcut => link_url, |
| LinkType::ShortcutUnknown => { |
| // Check if this is a case where the text is in []. |
| let text = elements |
| .iter() |
| .map(|e| e.get_contents()) |
| .collect::<Vec<String>>() |
| .join(""); |
| if link_url.starts_with("\"") && link_url.ends_with("\"") { |
| // This is not a link but an array of quoted strings. |
| continue; |
| } |
| if text == link_url.to_string() && link_url == link_title { |
| errors.push(DocCheckError::new_info_helpful( |
| ele.doc_line().line_num, |
| ele.doc_line().file_name.clone(), |
| &format!( |
| "unescaped [{}] not treating this as a shortcut link.", |
| link_url |
| ), |
| &format!( |
| "escaped \\[{}\\] or make a link [{}](/docs/{}", |
| link_title, link_url, link_url |
| ), |
| )); |
| } else { |
| errors.push(DocCheckError::new_error_helpful( |
| ele.doc_line().line_num, |
| ele.doc_line().file_name.clone(), |
| &format!( |
| "Unknown reference link to [{}][{}]", |
| text ,link_url |
| ), |
| &format!( |
| "making sure you added a matching [{}]: YOUR_LINK_HERE below this reference", |
| link_url))); |
| } |
| |
| continue; |
| } |
| LinkType::Autolink => link_url, |
| LinkType::Email => return Ok(None), |
| }; |
| if link.starts_with("mailto:") { |
| // do nothing. |
| return Ok(None); |
| } |
| link |
| } |
| Element::Image(link_type, link_url, link_title, elements, _) => { |
| let link = match link_type { |
| LinkType::Inline => link_url, |
| LinkType::Reference => link_url, |
| LinkType::ReferenceUnknown => { |
| errors.push(DocCheckError::new_error( |
| ele.doc_line().line_num, |
| ele.doc_line().file_name.clone(), |
| &format!( |
| "Unknown image reference link to {} ({})", |
| link_url, link_title |
| ), |
| )); |
| link_url |
| } |
| LinkType::Collapsed => link_url, |
| LinkType::CollapsedUnknown => todo!(), |
| LinkType::Shortcut => link_url, |
| LinkType::ShortcutUnknown => { |
| // Check if this is a case where the text is in []. |
| let text = elements |
| .iter() |
| .map(|e| e.get_contents()) |
| .collect::<Vec<String>>() |
| .join(""); |
| if text == link_url.to_string() && link_url == link_title { |
| errors.push(DocCheckError::new_info_helpful( |
| ele.doc_line().line_num, |
| ele.doc_line().file_name.clone(), |
| &format!( |
| "unescaped brackets - treated as text [{}]", |
| link_url |
| ), |
| &format!( |
| "escaping \\[{}\\] or an inline link [{}][/docs/{}", |
| link_title, link_url, link_url |
| ), |
| )); |
| continue; |
| } else { |
| errors.push(DocCheckError::new_error( |
| ele.doc_line().line_num, |
| ele.doc_line().file_name.clone(), |
| &format!( |
| "Unknown image shortcut link to {} ({})", |
| link_url, link_title |
| ), |
| )); |
| } |
| link_url |
| } |
| LinkType::Autolink => link_url, |
| LinkType::Email => return Ok(None), |
| }; |
| if link.starts_with("mailto:") { |
| // do nothing. |
| return Ok(None); |
| } |
| // Check for alt text |
| let alt = ele.get_contents().trim().to_string(); |
| if DISALLOWED_ALT_IMAGE_TEXT.contains(&alt.as_str()) { |
| errors.push(DocCheckError::new_error( |
| ele.doc_line().line_num, |
| ele.doc_line().file_name, |
| &format!( |
| "Invalid image alt text: {:?}, cannot be one of {:?}", |
| alt, DISALLOWED_ALT_IMAGE_TEXT |
| ), |
| )) |
| } |
| link |
| } |
| _ => { |
| return Ok(None); |
| } |
| }; |
| |
| let link_to_check = |
| match self.make_link_to_check(&element.doc_line().file_name, link) { |
| Ok(link) => link, |
| Err(e) => { |
| errors.push(DocCheckError::new_error( |
| element.doc_line().line_num, |
| element.doc_line().file_name, |
| &e.to_string(), |
| )); |
| String::from("") |
| } |
| }; |
| if link_to_check.is_empty() { |
| continue; |
| } |
| |
| let saw_error = do_check_link(&element.doc_line(), &link_to_check, &self.project)? |
| .map(|err| errors.push(err)) |
| .is_some(); |
| |
| let root_dir = self.root_dir.display().to_string(); |
| match is_intree_link(&self.project, &root_dir, &self.docs_folder, &link_to_check) { |
| Ok(Some(in_tree_path)) => { |
| if let Some(link_error) = do_in_tree_check( |
| &element.doc_line(), |
| &self.root_dir, |
| &self.docs_folder, |
| &link_to_check, |
| &in_tree_path, |
| ) { |
| errors.push(link_error); |
| } |
| } |
| Ok(None) => { |
| if self.check_remote_links && !saw_error { |
| self.links.push(LinkReference { |
| link: link_to_check.clone(), |
| location: element.doc_line(), |
| }); |
| } |
| } |
| Err(e) => errors.push(DocCheckError::new_error( |
| element.doc_line().line_num, |
| element.doc_line().file_name, |
| &e.to_string(), |
| )), |
| }; |
| } |
| } |
| |
| if errors.is_empty() { |
| Ok(None) |
| } else { |
| Ok(Some(errors)) |
| } |
| } |
| |
| /// At the end, check that the out of tree links work, if requested. |
| async fn post_check(&self) -> Result<Option<Vec<DocCheckError>>> { |
| let mut errors = vec![]; |
| |
| if let Some(link_errors) = check_external_links(&self.links).await { |
| errors.extend(link_errors); |
| } |
| |
| if !errors.is_empty() { |
| Ok(Some(errors)) |
| } else { |
| Ok(None) |
| } |
| } |
| } |
| |
| /// Checks specific to a link to an in-tree path. |
| pub(crate) fn do_in_tree_check( |
| doc_line: &DocLine, |
| root_dir: &Path, |
| docs_folder: &Path, |
| link_to_check: &str, |
| in_tree_path: &Path, |
| ) -> Option<DocCheckError> { |
| let filepath = root_dir.join(in_tree_path.strip_prefix("/").unwrap_or(in_tree_path)); |
| |
| if !path_helper::exists(&filepath) { |
| return Some(DocCheckError::new_error( |
| doc_line.line_num, |
| doc_line.file_name.clone(), |
| &format!("in-tree link to {} could not be found at {:?}", link_to_check, filepath), |
| )); |
| } else if filepath.components().any(|c| c == path::Component::ParentDir) { |
| let cannonical_path = match filepath.canonicalize() { |
| Ok(p) => p, |
| Err(e) => { |
| return Some(DocCheckError::new_error( |
| doc_line.line_num, |
| doc_line.file_name.clone(), |
| &format!("Error canonicalizing path: {:?}: {}", filepath, e), |
| )) |
| } |
| }; |
| if !cannonical_path.starts_with(root_dir) { |
| return Some(DocCheckError::new_error( |
| doc_line.line_num, |
| doc_line.file_name.clone(), |
| &format!( |
| "relative path {:?} points outside root directory {:?}", |
| in_tree_path, root_dir |
| ), |
| )); |
| } |
| } else if path_helper::is_dir(&filepath) { |
| // If it is a directory to the /docs directory, that directory needs |
| // to have a README.md file. |
| if in_tree_path |
| .components() |
| .position(|c| c == path::Component::Normal(OsStr::new(&docs_folder))) |
| == Some(1) |
| { |
| let readme_path = filepath.join("README.md"); |
| if !path_helper::exists(&readme_path) { |
| return Some(DocCheckError::new_error( |
| doc_line.line_num, |
| doc_line.file_name.clone(), |
| &format!( |
| "in-tree link to {} could not be found at {:?} or {:?}", |
| link_to_check, filepath, readme_path |
| ), |
| )); |
| } |
| } |
| // Non-docs paths are OK. |
| } |
| None |
| } |
| |
| /// Parse the link into a Uri, and check that it is either a path or that the http/https |
| /// links are valid for the host they are pointing to. |
| pub(crate) fn do_check_link( |
| doc_line: &DocLine, |
| link: &str, |
| project_being_checked: &str, |
| ) -> Result<Option<DocCheckError>> { |
| match link.parse::<Uri>() { |
| Ok(uri) => { |
| match uri.scheme() { |
| Some(scheme) => match scheme.as_str() { |
| "http" | "https" => {} |
| _ => return Ok(None), |
| }, |
| None => return Ok(None), |
| } |
| if let Some(errors) = check_link_authority(doc_line, &uri, project_being_checked) { |
| return Ok(Some(errors)); |
| } |
| |
| // Check for host language parameter on google owned urls. |
| if uri |
| .authority() |
| .map(|a| a.host()) |
| .map(|host| host.ends_with(".dev") || host.ends_with(".google.com")) |
| .unwrap_or(false) |
| && uri.query().map(|query| query.contains("hl=")).unwrap_or(false) |
| { |
| return Ok(Some(DocCheckError::new_error( |
| doc_line.line_num, |
| doc_line.file_name.clone(), |
| &format!("Do not add host language parameter `hl` {}", link), |
| ))); |
| } |
| |
| Ok(None) |
| } |
| Err(e) => Ok(Some(DocCheckError::new_error( |
| doc_line.line_num, |
| doc_line.file_name.clone(), |
| &format!("Invalid link {} : {}", link, e), |
| ))), |
| } |
| } |
| |
| /// Returns the relative path from the root_dir if the link it to a file in the fuchsia source tree. |
| pub(crate) fn is_intree_link( |
| project: &str, |
| root_dir: &str, |
| docs_folder: &Path, |
| link_to_check: &str, |
| ) -> Result<Option<PathBuf>> { |
| if link_to_check.starts_with(root_dir) { |
| let mut filepath = link_to_check.strip_prefix(root_dir).unwrap_or(link_to_check); |
| (filepath, _) = filepath.split_once('#').unwrap_or((filepath, "")); |
| // Split off the query parameters, if any |
| (filepath, _) = filepath.split_once('?').unwrap_or((filepath, "")); |
| return Ok(Some(PathBuf::from(filepath))); |
| } else if link_to_check.starts_with(&format!("https://{}/{}", GERRIT_HOST, project)) { |
| let uri: Uri = match link_to_check.parse::<Uri>() { |
| Ok(uri) => uri, |
| Err(e) => bail!("Invalid Url {}: {:?}", link_to_check, e), |
| }; |
| let parts = uri.path().split('/').collect::<Vec<&str>>(); |
| if parts.len() <= 3 { |
| let p = parts.join("/"); |
| if p == format!("/{}/", project) || p == format!("/{}", project) { |
| return Ok(None); |
| } |
| } |
| // skip over any branch spec. The first part is empty. It is +, or +show, or +log, etc. |
| if parts.len() > 3 && parts[2].starts_with('+') { |
| let filepath = PathBuf::from("/"); |
| if parts[3] == "refs" && parts[4] == "heads" { |
| return Ok(Some(filepath.join(parts[6..].join("/")))); |
| } else if parts[3] == "HEAD" && parts[4] == docs_folder.to_string_lossy() { |
| return Ok(Some(filepath.join(parts[4..].join("/")))); |
| } else { |
| return Ok(None); |
| } |
| } else { |
| return Ok(Some(PathBuf::from(parts.join("/")))); |
| } |
| } else if link_to_check.starts_with('/') { |
| let (mut filepath, _) = link_to_check.split_once('#').unwrap_or((link_to_check, "")); |
| (filepath, _) = filepath.split_once('?').unwrap_or((filepath, "")); |
| |
| return match normalize_intree_path(filepath) { |
| Ok(normalized) => Ok(Some(normalized)), |
| Err(e) => Err(e), |
| }; |
| } |
| Ok(None) |
| } |
| |
| /// Checks that URLs are not incorrectly pointing to fuchsia.dev or fuchsia.googlesource.com |
| fn check_link_authority( |
| doc_line: &DocLine, |
| uri: &Uri, |
| project_being_checked: &str, |
| ) -> Option<DocCheckError> { |
| let link_to_fuchsia_gerrit_host = match uri.authority() { |
| Some(a) => *a == GERRIT_HOST, |
| None => false, |
| }; |
| let link_to_published_docs_host = match uri.authority() { |
| Some(a) => *a == PUBLISHED_DOCS_HOST, |
| None => false, |
| }; |
| |
| let parts = uri.path().split('/').collect::<Vec<&str>>(); |
| let project = parts[1]; |
| |
| /* |
| Links to gerrit source code should be to HEAD or refs/heads/main or equivalent. |
| The links can also not be to unknown or obsolete projects. |
| */ |
| if link_to_fuchsia_gerrit_host { |
| if !VALID_PROJECTS.contains(&project) { |
| return Some(DocCheckError::new_error( |
| doc_line.line_num, |
| doc_line.file_name.clone(), |
| &format!("Obsolete or invalid project {}: {}", project, uri), |
| )); |
| } |
| if !on_gerrit_master(uri) && project == project_being_checked { |
| let branch_index = parts.iter().position(|x| *x == "+").unwrap(); |
| let file_path = |
| if parts[branch_index + 1] == "refs" && parts[branch_index + 2] == "heads" { |
| parts[branch_index + 4..].join("/") |
| } else { |
| parts[branch_index + 2..].join("/") |
| }; |
| |
| //Possible point of discussion: Non-HEAD links are open discussion for non- //docs links. |
| |
| // Allow files that are not markdown, such as OWNERS to be links. |
| if parts.contains(&"docs") && uri.path().ends_with(".md") { |
| return Some(DocCheckError::new_error_helpful( |
| doc_line.line_num, |
| doc_line.file_name.clone(), |
| &format!("Invalid link to non-main branch: {}", uri), |
| &format!("filepath: /{}", file_path), |
| )); |
| } |
| } |
| } |
| |
| /* |
| Links to fuchsia.dev (where the docs are published) should not use http(s):, but |
| rather use relative paths from the fuchsia source root. For example, /docs/some/file.md. |
| The some projects in fuchsia.dev are allowed to be directly linked since they are not |
| part of the markdown checked in. |
| |
| There is also a list of exceptions of files that can link via https, these allow gtiles to |
| point to fuchsia.dev, and could be removed at some point in the future. |
| */ |
| if link_to_published_docs_host && !PUBLISHED_LINKS_ALLOWED.contains(&project) { |
| let base_name = doc_line |
| .file_name |
| .file_name() |
| .unwrap_or_else(|| OsStr::new("")) |
| .to_string_lossy() |
| .to_string(); |
| if !FILES_ALLOWED_TO_LINK_TO_PUBLISHED_DOCS.contains(&base_name.as_str()) { |
| // If the link is to the published docs directory (fuchsia-src), then |
| // a path should be used instead. |
| if parts.contains(&"fuchsia-src") { |
| return Some(DocCheckError::new_error( |
| doc_line.line_num, |
| doc_line.file_name.clone(), |
| &format!( |
| "Should not link to {} via {}, use relative filepath", |
| uri, |
| uri.scheme_str().unwrap_or_default() |
| ), |
| )); |
| } |
| } |
| } |
| None |
| } |
| |
| /// Checks whether the URI points to the master branch of a Gerrit (i.e., |
| /// googlesource.com) project. |
| fn on_gerrit_master(uri: &Uri) -> bool { |
| let path_segments: Vec<&str> = |
| uri.path().split('/').skip_while(|p| p != &"+").skip(1).take(3).collect(); |
| if path_segments.is_empty() { |
| // no + branch spec in the URL, so defaults to main. |
| return true; |
| } |
| // Links to gerrit are of the form: |
| // https://fuchsia.googlesource.com/fuchsia/+/refs/heads/main/docs/README.md |
| // https://fuchsia.googlesource.com/<project>/+/refs/heads/<branch>/<path> |
| |
| // path_segments is the path (everything after the server name) split on / |
| // branch_index is the index of the + in the path. |
| |
| match path_segments[0] { |
| "master" | "main" | "HEAD" => true, |
| "refs" => { |
| // There can be refs/ to other things, so make sure there are at least 3 segments after |
| // the +. Then join these together for comparison. |
| if 3 < path_segments.len() { |
| let ref_path = path_segments[1..4].join("/"); |
| return ref_path == "refs/heads/main" || ref_path == "refs/heads/master"; |
| } |
| false |
| } |
| _ => false, |
| } |
| } |
| |
| fn normalize_intree_path(filepath: &str) -> Result<PathBuf> { |
| let orig = PathBuf::from(filepath); |
| let mut normalized = PathBuf::new(); |
| let segments = orig.components(); |
| |
| for part in segments { |
| match part { |
| std::path::Component::Prefix(p) => { |
| // Prefix is used on Windows systems and is |
| // part of the path that is not part of normalizing |
| // the path. |
| // For Non-windows, it should not appear. |
| eprintln!("Unexpected path component {:?}", p); |
| } |
| std::path::Component::RootDir => { |
| //RootDir is the beginning of the Path, after |
| // any Prefix. |
| normalized.push("/"); |
| } |
| std::path::Component::CurDir => { |
| //CurDir is the current directory, "." |
| // it is ignored. |
| } |
| std::path::Component::ParentDir => { |
| // ParentDir is the parent of the current item, ".." |
| if !normalized.pop() { |
| bail!("Cannot normalize {}, references parent beyond root.", filepath); |
| } |
| } |
| std::path::Component::Normal(p) => normalized.push(p), |
| } |
| } |
| Ok(normalized) |
| } |
| |
| pub async fn check_external_links(links: &Vec<LinkReference>) -> Option<Vec<DocCheckError>> { |
| // sort the links to take advantage of keep alive |
| // HashMap is <authority, set<links>. |
| let mut domain_sorted_links = HashMap::<String, HashSet<&LinkReference>>::new(); |
| let mut errors = vec![]; |
| for link in links { |
| match link.link.parse::<Uri>() { |
| Ok(uri) => { |
| if let Some(authority) = uri.authority() { |
| let key = authority.to_string(); |
| |
| let set = domain_sorted_links.entry(key).or_default(); |
| set.insert(link); |
| } else { |
| errors.push(DocCheckError::new_error( |
| link.location.line_num, |
| link.location.file_name.clone(), |
| &format!("Error parsing {}: no authority found", link.link), |
| )); |
| } |
| } |
| Err(e) => errors.push(DocCheckError::new_error( |
| link.location.line_num, |
| link.location.file_name.clone(), |
| &format!("Error parsing {}: {}", link.link, e), |
| )), |
| }; |
| } |
| |
| let client: HttpsClient = new_https_client_from_tcp_options(tcp_options()); |
| for (authority, links) in domain_sorted_links { |
| let mut pending_requests = vec![]; |
| println!("checking {authority} {link_count} links", link_count = links.len()); |
| for link in links { |
| let p = check_url_link(client.clone(), link); |
| pending_requests.push(p); |
| } |
| let results = futures::future::join_all(pending_requests); |
| (results.await).into_iter().flatten().for_each(|e| errors.push(e)); |
| } |
| |
| if errors.is_empty() { |
| None |
| } else { |
| Some(errors) |
| } |
| } |
| |
| /// Check that the URL is valid (200 or 301 or 302). |
| async fn check_url_link(client: HttpsClient, link: &LinkReference) -> Option<DocCheckError> { |
| let request = match Request::get(&link.link).body(Body::from("")) { |
| Ok(request) => request, |
| Err(e) => { |
| return Some(DocCheckError::new_error( |
| link.location.line_num, |
| link.location.file_name.clone(), |
| &format!("Error {} requesting {}", e, link.link), |
| )) |
| } |
| }; |
| |
| match client.request(request).await { |
| Ok(response) => match response.status() { |
| StatusCode::OK | StatusCode::FOUND | StatusCode::MOVED_PERMANENTLY => None, |
| _ => Some(DocCheckError::new_error( |
| link.location.line_num, |
| link.location.file_name.clone(), |
| &format!("Error response {} reading {}", response.status(), &link.link), |
| )), |
| }, |
| Err(e) => Some(DocCheckError::new_error( |
| link.location.line_num, |
| link.location.file_name.clone(), |
| &format!("Error {} reading {}", e, link.link), |
| )), |
| } |
| } |
| |
| fn tcp_options() -> TcpOptions { |
| let mut options: TcpOptions = std::default::Default::default(); |
| |
| // Use TCP keepalive to notice stuck connections. |
| // After 60s with no data received send a probe every 15s. |
| options.keepalive_idle = Some(std::time::Duration::from_secs(60)); |
| options.keepalive_interval = Some(std::time::Duration::from_secs(15)); |
| // After 8 probes go unacknowledged treat the connection as dead. |
| options.keepalive_count = Some(8); |
| |
| options |
| } |
| |
| /// Called from main to register all the checks to preform which are implemented in this module. |
| pub(crate) fn register_markdown_checks(opt: &DocCheckerArgs) -> Result<Vec<Box<dyn DocCheck>>> { |
| let checker = LinkChecker { |
| root_dir: opt.root.clone(), |
| project: opt.project.clone(), |
| docs_folder: opt.docs_folder.clone(), |
| check_remote_links: !opt.local_links_only, |
| links: vec![], |
| }; |
| Ok(vec![Box::new(checker)]) |
| } |
| |
| #[cfg(test)] |
| mod tests { |
| use super::*; |
| use crate::DocContext; |
| |
| #[test] |
| fn test_make_link_to_check() -> Result<()> { |
| let checker = LinkChecker { |
| root_dir: PathBuf::from("/my/root/fuchsia"), |
| project: "fuchsia".to_string(), |
| docs_folder: PathBuf::from("docs"), |
| check_remote_links: false, |
| links: vec![], |
| }; |
| let filename = PathBuf::from("/my/root/fuchsia/docs/index.md"); |
| |
| let test_data = [ |
| ("README.md", "/docs/README.md"), |
| ("https://my-server.com", "https://my-server.com/"), |
| ( |
| "http://my-server.com/page?qp=1&words=one two three", |
| "http://my-server.com/page?qp=1&words=one%20two%20three", |
| ), |
| ("/docs/some-file.md", "/docs/some-file.md"), |
| ("#Anchor-name", "/docs/index.md#Anchor-name"), |
| ("path/to/sub/info.md", "/docs/path/to/sub/info.md"), |
| ("mailto:someone@somewhere.tld", "mailto:someone@somewhere.tld"), |
| ("https:///bad-url?x=", "https:///bad-url?x="), |
| ("/reference/to/generated.md", "https://fuchsia.dev/reference/to/generated.md"), |
| ]; |
| |
| for (input, expected) in test_data { |
| let actual = |
| checker.make_link_to_check(filename.as_path(), &CowStr::Borrowed(input))?; |
| |
| assert_eq!(actual, expected) |
| } |
| Ok(()) |
| } |
| |
| #[test] |
| fn test_normalize_intree_path() -> Result<()> { |
| let test_data = [ |
| ("/docs", PathBuf::from("/docs")), |
| ("/docs/../docs", PathBuf::from("/docs")), |
| ("/docs/sub/location.md", PathBuf::from("/docs/sub/location.md")), |
| ("/docs/sub/two/../location.md", PathBuf::from("/docs/sub/location.md")), |
| ("/docs/sub/./location.md", PathBuf::from("/docs/sub/location.md")), |
| ]; |
| |
| for (data, expected) in test_data { |
| let actual = normalize_intree_path(data)?; |
| assert_eq!(actual, expected); |
| } |
| |
| Ok(()) |
| } |
| |
| #[test] |
| fn test_happy_path() -> Result<()> { |
| let opt = DocCheckerArgs { |
| root: PathBuf::from("/path/to/fuchsia"), |
| project: "fuchsia".to_string(), |
| docs_folder: PathBuf::from("docs"), |
| local_links_only: true, |
| check_reference_links: false, |
| }; |
| |
| let mut checks = register_markdown_checks(&opt)?; |
| assert_eq!(checks.len(), 1); |
| |
| let ctx = DocContext::new( |
| PathBuf::from("/docs/README.md"), |
| "This is a line to [something](/docs/something.md", |
| ); |
| |
| if let Some(check) = checks.first_mut() { |
| for ele in ctx { |
| let errors = check.check(&ele)?; |
| assert!(errors.is_none(), "expected none, got {:?}", errors); |
| } |
| } |
| |
| Ok(()) |
| } |
| |
| #[test] |
| fn test_is_in_tree_link() -> Result<()> { |
| let root_dir = "/some/root/dir"; |
| let project = "fuchsia"; |
| let docs_folder = PathBuf::from("docs"); |
| |
| let test_cases = [ |
| ("/docs/README.md", Some(PathBuf::from("/docs/README.md"))), |
| ("/docs/somewhere/file.md#header1", Some(PathBuf::from("/docs/somewhere/file.md"))), |
| ("https://google.com", None), |
| ("mailto:someone@email.com", None), |
| ("/src/to/a/program.cc", Some(PathBuf::from("/src/to/a/program.cc"))), |
| ("https://fuchsia.googlesource.com/fuchsia/+/HEAD/sdk/lib/fdio", None), |
| ("https://fuchsia.googlesource.com/fuchsia/docs/README.md", Some(PathBuf::from("/fuchsia/docs/README.md"))), |
| ("https://fuchsia.googlesource.com/fuchsia/+/7461d8882167e7a9d1b494e3b1734d2c063830fc/build/package.gni#604", None), |
| ("https://fuchsia.googlesource.com/fuchsia/+show/HEAD/docs/concepts/kernel/_toc.yaml", Some(PathBuf::from("/docs/concepts/kernel/_toc.yaml"))), |
| ("https://fuchsia.googlesource.com/fuchsia", None), |
| ("https://fuchsia.googlesource.com/fuchsia/", None), |
| // Since this is not to the /docs dir, it should not be an in-tree link. |
| ("https://fuchsia.googlesource.com/fuchsia/+log/d381548c6aef76926e6203a2ad2265dd510d1e9b", None) |
| |
| ]; |
| for (link_to_check, expected) in test_cases { |
| let result = is_intree_link(project, root_dir, &docs_folder, link_to_check)?; |
| assert_eq!(result, expected); |
| } |
| Ok(()) |
| } |
| |
// Runs the single check returned by `register_markdown_checks` over a table of
// markdown snippets and compares the errors it reports, in order, against the
// errors listed for each snippet (`None` means no errors are expected).
| #[test] |
| fn test_check() -> Result<()> { |
// Options used for this test; note that `check_reference_links` is off here.
| let opt = DocCheckerArgs { |
| root: PathBuf::from("/path/to/fuchsia"), |
| project: "fuchsia".to_string(), |
| docs_folder: PathBuf::from("docs"), |
| local_links_only: true, |
| check_reference_links: false, |
| }; |
| |
// Exactly one check is expected to be registered; it is used as `checks[0]` below.
| let mut checks = register_markdown_checks(&opt)?; |
| assert_eq!(checks.len(), 1); |
| |
// NOTE(review): the next row looks truncated. The `vec![` opener and the start
// of the first test case are missing (the rows that follow close a
// `DocContext::new(...)` call and a tuple that are never opened here).
// Restore this row from the upstream file before relying on this test.
| let test_data: Vec<(DocContext<'_>, Option<Vec<DocCheckError>>)> = vec", |
| ), |
| None, |
| ), |
| ( |
| DocContext::new( |
| PathBuf::from("/docs/README.md"), |
// NOTE(review): the expected errors below refer to an image with empty alt
// text that links to /docs/something.png, but no image markup is present in
// this string. It appears to have been lost; confirm against upstream.
| "invalid image text ", |
| ), |
| Some( |
| [DocCheckError::new_error(1, PathBuf::from("/docs/README.md"), |
| "Invalid image alt text: \"\", cannot be one of [\"\"]"), |
| DocCheckError::new_error(1,PathBuf::from("/docs/README.md"), |
| "in-tree link to /docs/something.png could not be found at \"/path/to/fuchsia/docs/something.png\"")].to_vec()), |
| ), |
| ( |
| DocContext::new( |
| PathBuf::from("/docs/README.md"), |
| "invalid url [oops](https:///nowhere/something.md?xx)", |
| ), |
| Some([DocCheckError::new_error(1, PathBuf::from("/docs/README.md"), |
| "Invalid link https:///nowhere/something.md?xx : invalid format")].to_vec()) |
| ), |
| ( |
| DocContext::new( |
| PathBuf::from("/docs/README.md"), |
| "relative path outside root [oops](/docs/../../illegal.md)", |
| ), |
| Some([DocCheckError::new_error(1,PathBuf::from("/docs/README.md"), |
| "Cannot normalize /docs/../../illegal.md, references parent beyond root.")].to_vec()) |
| ), |
| ( |
| DocContext::new( |
| PathBuf::from("/docs/README.md"), |
// NOTE(review): the sample text says the `hl` parameter is not allowed, yet no
// error is expected. Presumably this external link is not examined because
// `local_links_only` is true; confirm.
| "hl param is not allowed [hl](https://google.com/something?hl=en)", |
| ), |
| None, |
| ), |
| ( |
| DocContext::new( |
| PathBuf::from("/docs/README.md"), |
| "invalid project link [garnet](https://fuchsia.googlesource.com/garnet/+/HEAD/src/file.cc)", |
| ), |
| Some([DocCheckError::new_error(1, PathBuf::from("/docs/README.md"), |
| "Obsolete or invalid project garnet: https://fuchsia.googlesource.com/garnet/+/HEAD/src/file.cc")].to_vec()) |
| ), |
| ( |
| DocContext::new( |
| PathBuf::from("/docs/README.md"), |
// A reference-style link whose definition sits two lines below its use.
| "A reference link to [`topaz`][flutter-gni]\n\n\ |
| [flutter-gni]: https://fuchsia.googlesource.com/topaz/+/HEAD/runtime/flutter_runner/flutter_app.gni \"Flutter GN build template\"" |
| ), |
| Some([DocCheckError::new_error(1, PathBuf::from("/docs/README.md"), |
| "Obsolete or invalid project topaz: https://fuchsia.googlesource.com/topaz/+/HEAD/runtime/flutter_runner/flutter_app.gni")].to_vec()) |
| ), |
| ( |
| DocContext::new( |
| PathBuf::from("/docs/README.md"), |
| "non-master branch link to docs [old doc](https://fuchsia.googlesource.com/fuchsia/+/some-branch/docs/file.md)", |
| ), |
// This case expects an error built with `new_error_helpful`, which carries an
// extra string ("filepath: /docs/file.md") besides the message.
| Some([DocCheckError::new_error_helpful(1,PathBuf::from("/docs/README.md"), |
| "Invalid link to non-main branch: https://fuchsia.googlesource.com/fuchsia/+/some-branch/docs/file.md","filepath: /docs/file.md")].to_vec()) |
| ), |
| ( |
| DocContext::new( |
| PathBuf::from("/docs/README.md"), |
| "non-master branch link to src ok [old source](https://fuchsia.googlesource.com/fuchsia/+/some-branch/tools/file.cc)", |
| ), |
| None, |
| ), |
| ( |
| DocContext::new( |
| PathBuf::from("/docs/README.md"), |
| "non-markdown file OK to link to docs [non-source](https://fuchsia.googlesource.com/fuchsia/+/refs/heads/main/docs/OWNERS)", |
| ), |
| None, |
| ) |
| ]; |
| |
// Every element produced by iterating a context is checked on its own.
| for (ctx, expected_errors) in test_data { |
| for ele in ctx { |
| let errors = checks[0].check(&ele)?; |
| if let Some(ref expected_list) = expected_errors { |
// The iterator is recreated for each element, so each element's errors are
// matched against the expected list starting from its first entry.
| let mut expected_iter = expected_list.iter(); |
| if let Some(actual_errors) = errors { |
| for actual in actual_errors { |
| if let Some(expected) = expected_iter.next() { |
| assert_eq!(&actual, expected); |
| } else { |
| panic!("Got unexpected error returned: {:?}", actual); |
| } |
| } |
// Anything left in the iterator was expected but never reported.
| let unused_errors: Vec<&DocCheckError> = expected_iter.collect(); |
| if !unused_errors.is_empty() { |
| panic!("Expected more errors: {:?}", unused_errors); |
| } |
// NOTE(review): this condition is always true here, because the enclosing
// `if let` has already matched `Some`.
| } else if expected_errors.is_some() { |
| panic!("No errors, but expected {:?}", expected_errors); |
| } |
| } else if errors.is_some() { |
| panic!("Got unexpected errors {:?}", errors.unwrap()); |
| } |
| } |
| } |
| |
| Ok(()) |
| } |
| |
| // Calls `do_in_tree_check` directly with a fixed source location and compares |
| // its return value with the expected outcome for each link: `None` for the |
| // first link, and a "could not be found" error for the second. |
| #[test] |
| fn test_do_intree_check() -> Result<()> { |
| let source_file = "some/file.md"; |
| let location = DocLine { line_num: 1, file_name: PathBuf::from(source_file) }; |
| let checkout_root = PathBuf::from("/path/to/fuchsia"); |
| let docs = PathBuf::from("docs"); |
| |
| // Error expected for the second case; its message names both the path itself |
| // and a README.md beneath it. |
| let missing_target = DocCheckError::new_error( |
| 1, |
| PathBuf::from(source_file), |
| "in-tree link to /docs/no_readme could not be found at \"/path/to/fuchsia/docs/no_readme\" or \"/path/to/fuchsia/docs/no_readme/README.md\"", |
| ); |
| |
| // (link as written, resolved in-tree path, expected result) |
| // NOTE(review): presumably the mocked path helper used under test reports the |
| // first path as existing and the second as missing; confirm against the mock. |
| let cases = [ |
| ("/docs/exists/something.md", "/docs/exists/something.md", None), |
| ("/docs/no_readme", "/docs/no_readme", Some(missing_target)), |
| ]; |
| |
| for (link, resolved, want) in cases { |
| let resolved = PathBuf::from(resolved); |
| let got = do_in_tree_check(&location, &checkout_root, &docs, link, &resolved); |
| assert_eq!(got, want); |
| } |
| Ok(()) |
| } |
| |
// Same table-driven shape as the other check tests, but with
// `check_reference_links` switched on and every context built through
// `DocContext::new_with_checks(.., true)`. Each snippet is paired with the
// errors the check is expected to report (`None` means none).
| #[test] |
| fn test_check_reference_links() -> Result<()> { |
| let opt = DocCheckerArgs { |
| root: PathBuf::from("/path/to/fuchsia"), |
| project: "fuchsia".to_string(), |
| docs_folder: PathBuf::from("docs"), |
| local_links_only: true, |
| check_reference_links: true, |
| }; |
| |
// Exactly one check is expected to be registered; it is used as `checks[0]` below.
| let mut checks = register_markdown_checks(&opt)?; |
| assert_eq!(checks.len(), 1); |
| |
// NOTE(review): the next row looks truncated. The `vec![` opener and the start
// of the first test case are missing (the rows that follow supply the trailing
// `true` argument and close a call and a tuple that are never opened here).
// Restore this row from the upstream file before relying on this test.
| let test_data: Vec<(DocContext<'_>, Option<Vec<DocCheckError>>)> = vec", |
| true |
| ), |
| None, |
| ), |
| ( |
| DocContext::new_with_checks( |
| PathBuf::from("/docs/README.md"), |
| "invalid url [oops](https:///nowhere/something.md?xx)", |
| true |
| ), |
| Some([DocCheckError::new_error(1, PathBuf::from("/docs/README.md"), |
| "Invalid link https:///nowhere/something.md?xx : invalid format")].to_vec()) |
| ), |
| ( |
| DocContext::new_with_checks( |
| PathBuf::from("/docs/README.md"), |
// A reference-style link whose definition sits two lines below its use.
| "A reference link to [`topaz`][flutter-gni]\n\n\ |
| [flutter-gni]: https://fuchsia.googlesource.com/topaz/+/HEAD/runtime/flutter_runner/flutter_app.gni \"Flutter GN build template\"", |
| true |
| ), |
| Some([DocCheckError::new_error(1, PathBuf::from("/docs/README.md"), |
| "Obsolete or invalid project topaz: https://fuchsia.googlesource.com/topaz/+/HEAD/runtime/flutter_runner/flutter_app.gni")].to_vec()) |
| ), |
| ( |
| DocContext::new_with_checks( |
| PathBuf::from("/docs/README.md"), |
| "brackets which are not a link [your name here]", |
| true |
| ), |
// Bare brackets are expected to yield an entry built with `new_info_helpful`
// rather than one of the `new_error*` constructors.
| Some([DocCheckError::new_info_helpful(1, PathBuf::from("/docs/README.md"), |
| "unescaped [your name here] not treating this as a shortcut link.", |
| "escaped \\[your name here\\] or make a link [your name here](/docs/your name here") |
| ].to_vec()), |
| ), |
| ( |
| DocContext::new_with_checks( |
| PathBuf::from("/docs/README.md"), |
// The label `link-to-text` is used here but never defined in the snippet.
| "missing [text][link-to-text]", true), |
| Some([DocCheckError::new_error_helpful(1, PathBuf::from("/docs/README.md"), |
| "Unknown reference link to [text][link-to-text]", |
| "making sure you added a matching [link-to-text]: YOUR_LINK_HERE below this reference" |
| )].to_vec()) |
| ), |
| ( |
| DocContext::new_with_checks( |
| PathBuf::from("/docs/README.md"), |
// Brackets around a quoted string: no error or info entry is expected.
| r#"pw_toolchain_STATIC_ANALYSIS_SKIP_INCLUDE_PATHS = [".*/third_party/.*"]"#, |
| true |
| ), |
| None |
| ), |
| ]; |
| |
// Every element produced by iterating a context is checked on its own.
| for (ctx, expected_errors) in test_data { |
| for ele in ctx { |
| let errors = checks[0].check(&ele)?; |
| if let Some(ref expected_list) = expected_errors { |
// The iterator is recreated for each element, so each element's errors are
// matched against the expected list starting from its first entry.
| let mut expected_iter = expected_list.iter(); |
| if let Some(actual_errors) = errors { |
| for actual in actual_errors { |
| if let Some(expected) = expected_iter.next() { |
| assert_eq!(&actual, expected); |
| } else { |
| panic!("Got unexpected error returned: {:?}", actual); |
| } |
| } |
// Anything left in the iterator was expected but never reported.
| let unused_errors: Vec<&DocCheckError> = expected_iter.collect(); |
| if !unused_errors.is_empty() { |
| panic!("Expected more errors: {:?}", unused_errors); |
| } |
// NOTE(review): this condition is always true here, because the enclosing
// `if let` has already matched `Some`.
| } else if expected_errors.is_some() { |
| panic!("No errors, but expected {:?}", expected_errors); |
| } |
| } else if errors.is_some() { |
| panic!("Got unexpected errors {:?}", errors.unwrap()); |
| } |
| } |
| } |
| |
| Ok(()) |
| } |
| } |