blob: 644156955f9feff5da39fcdb0e5fbce86f800357 [file] [log] [blame]
/*
 * Copyright 2018 The Fuchsia Authors. All rights reserved.
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
import { EntityExtractor, Entity } from "./entity";
import { toArray } from "./util";
// Matches a run of one or more ASCII whitespace characters (space, CR, LF,
// tab, form feed). Used as the separator when splitting space-separated
// attribute values such as itemref, itemtype and itemprop.
const asciiSpaces = /[ \r\n\t\f]+/;
/**
 * Extracts HTML microdata items from a document and reports whether a batch
 * of DOM mutations could have changed them.
 *
 * Each item becomes an `Entity`: `@type` holds the item types, `@id` the
 * global identifier, and every property name maps to an array of values
 * (strings or nested entities).
 */
export class MicrodataExtractor implements EntityExtractor {
  /** Attributes that make an element part of the microdata model. */
  private static readonly microdataAttributes = [
    'itemscope', 'itemprop', 'itemtype', 'itemid', 'itemref',
  ];

  /** Selector matching any element that carries a microdata attribute. */
  private static readonly microdataSelector =
    MicrodataExtractor.microdataAttributes.map(name => '[' + name + ']').join(',');

  /**
   * Extracts every top-level item, i.e. every `itemscope` element that is not
   * itself a property of another item.
   *
   * @param document The document to scan
   * @returns One entity per top-level item, in document order
   */
  public extract(document: HTMLDocument): Entity[] {
    const entities: Entity[] = [];
    for (const node of toArray(document.querySelectorAll('*[itemscope]:not([itemprop])'))) {
      entities.push(this.extractEntity(node, []));
    }
    return entities;
  }

  /**
   * Splits a space-separated attribute value into its tokens.
   * Empty tokens (missing attribute, blank value, leading or trailing
   * whitespace) are dropped so they never become type or property names.
   *
   * @param value The raw attribute value, or null when the attribute is absent
   */
  private tokens(value: string | null): string[] {
    return (value || "").split(asciiSpaces).filter(token => token.length > 0);
  }

  /**
   * Returns true if the target element is the direct child from a microdata perspective.
   * That is: root is an ancestor of element and no element strictly between
   * them opens a new item with `itemscope`.
   *
   * @param element The target element
   * @param root The root element
   */
  private directChild(element: Element, root: Element): boolean {
    for (let e = element.parentElement; e; e = e.parentElement) {
      if (e === root) {
        return true;
      }
      if (e.hasAttribute('itemscope')) {
        return false;
      }
    }
    return false;
  }

  /**
   * Collects the ids listed in `itemref` on the root and on the descendants
   * that are direct children of the root from a microdata perspective.
   *
   * @param root The element to inspect
   */
  private itemRefs(root: Element): string[] {
    // Find itemrefs on the root.
    const itemrefs = this.tokens(root.getAttribute('itemref'));
    // Find itemrefs on the children.
    for (const e of toArray(root.querySelectorAll('[itemref]')).filter(r => this.directChild(r, root))) {
      for (const i of this.tokens(e.getAttribute('itemref'))) {
        itemrefs.push(i);
      }
    }
    return itemrefs;
  }

  /**
   * Returns the descendants of root that carry a non-empty `itemprop` and are
   * not nested inside another item below root.
   *
   * @param root The element whose subtree is searched
   */
  private directProperties(root: Element): Element[] {
    // Get all the itemprop children...
    return toArray(root.querySelectorAll('[itemprop]:not([itemprop=""])'))
      // that are direct children...
      .filter(prop => this.directChild(prop, root));
  }

  /**
   * Returns the property elements of an item: those found in its own subtree
   * plus those contributed by the elements named in its `itemref` attribute.
   *
   * @param root The item element
   */
  private itemProperties(root: Element): Element[] {
    const props = this.directProperties(root);
    if (root.hasAttribute('itemscope') && root.hasAttribute('itemref')) {
      // Resolve ids in the document that owns the item, which need not be the
      // global document.
      const owner = root.ownerDocument || document;
      for (const itemref of this.tokens(root.getAttribute('itemref'))) {
        const element = owner.getElementById(itemref);
        if (!element) {
          continue;
        }
        const referenced: Element[] = [];
        // A referenced element that names a property is itself a property.
        if (this.tokens(element.getAttribute('itemprop')).length > 0) {
          referenced.push(element);
        }
        // The subtree of a referenced element contributes properties only when
        // that element does not open an item of its own; otherwise they belong
        // to the nested item. Not following itemref on referenced elements also
        // rules out unbounded recursion on self-referencing ids.
        if (!element.hasAttribute('itemscope')) {
          for (const prop of this.directProperties(element)) {
            referenced.push(prop);
          }
        }
        for (const prop of referenced) {
          // An element reachable both by nesting and by reference counts once.
          if (props.indexOf(prop) === -1) {
            props.push(prop);
          }
        }
      }
    }
    return props;
  }

  /**
   * Computes the value of a property element.
   *
   * @param element The element carrying `itemprop`
   * @param memory The items currently being extracted, used to detect cycles
   * @returns A nested entity, a string value, or the string 'ERROR' when the
   *     element is already being extracted (reference cycle)
   */
  private propertyValue(element: Element, memory: Element[]): Entity | string {
    memory = memory || [];
    if (memory.indexOf(element) !== -1) {
      return 'ERROR';
    }
    if (element.hasAttribute('itemscope')) {
      return this.extractEntity(element, memory);
    }
    if (element.hasAttribute('content')) {
      return element.getAttribute('content')!;
    }
    if (element instanceof HTMLAudioElement ||
        element instanceof HTMLEmbedElement ||
        element instanceof HTMLIFrameElement ||
        element instanceof HTMLImageElement ||
        element instanceof HTMLSourceElement ||
        element instanceof HTMLTrackElement ||
        element instanceof HTMLVideoElement) {
      return element.src || "";
    }
    if (element instanceof HTMLObjectElement) {
      // <object> carries its URL in `data` rather than `src`.
      return element.data || "";
    }
    if (element instanceof HTMLAnchorElement ||
        element instanceof HTMLAreaElement ||
        element instanceof HTMLLinkElement) {
      return element.href || "";
    }
    if (element instanceof HTMLMeterElement ||
        element instanceof HTMLDataElement) {
      if (element.hasAttribute('value')) {
        return element.getAttribute('value')!;
      }
    } else if (element instanceof HTMLTimeElement) {
      if (element.hasAttribute('datetime')) {
        return element.getAttribute('datetime')!;
      }
    }
    // Everything else, and the elements above when their dedicated attribute
    // is absent, falls back to the text content.
    return element.textContent || "";
  }

  /**
   * Converts one item element into an entity.
   *
   * @param item The element carrying `itemscope`
   * @param memory The items already being extracted further up the call chain
   */
  private extractEntity(item: Element, memory: Element[]): Entity {
    // 1. Let result be an empty object.
    const result: Entity = {};
    // 2. If no memory was passed to the algorithm, let memory be an empty
    //    list.
    // 3. Add item to memory. A copy is extended rather than the caller's
    //    array so that sibling properties do not see each other's items and
    //    report a cycle where there is none.
    const seen = (memory || []).concat([item]);
    // 4. If the item has any item types, add an entry to result called
    //    "@type" whose value is an array listing the item types of item, in
    //    the order they were specified on the itemtype attribute.
    const itemTypes = this.tokens(item.getAttribute('itemtype'));
    if (itemTypes.length > 0) {
      result['@type'] = itemTypes;
    }
    // 5. If the item has a global identifier, add an entry to result called
    //    "@id" whose value is the global identifier of item.
    const itemId = item.getAttribute('itemid');
    if (itemId !== null) {
      result['@id'] = itemId;
    }
    // 7. For each element element that has one or more property names and is
    //    one of the properties of the item item, in the order those elements
    //    are given by the algorithm that returns the properties of an item,
    //    run the following substeps:
    for (const element of this.itemProperties(item)) {
      const value = this.propertyValue(element, seen);
      for (const name of this.tokens(element.getAttribute('itemprop'))) {
        // Property names come from page markup. Only own properties count, so
        // names such as "constructor" or "__proto__" neither hit inherited
        // members nor replace the prototype.
        const existing = Object.prototype.hasOwnProperty.call(result, name)
          ? result[name]
          : undefined;
        if (Array.isArray(existing)) {
          (existing as Array<string | Entity>).push(value);
        } else {
          Object.defineProperty(result, name, {
            value: [value],
            enumerable: true,
            writable: true,
            configurable: true,
          });
        }
      }
    }
    return result;
  }

  /** Returns true if the element carries at least one microdata attribute. */
  private hasMicrodataAttribute(element: Element): boolean {
    return MicrodataExtractor.microdataAttributes.some(name => element.hasAttribute(name));
  }

  /**
   * Returns true if the node is, or lies inside, an element that carries
   * `itemprop`; the text of such an element may be a property value.
   */
  private insideProperty(node: Node): boolean {
    let e: Element | null = node instanceof Element ? node : node.parentElement;
    for (; e; e = e.parentElement) {
      if (e.hasAttribute('itemprop')) {
        return true;
      }
    }
    return false;
  }

  /**
   * Returns true if adding, removing or editing the node can affect the
   * extracted entities: the node or one of its descendants carries microdata
   * markup, or the node sits inside a property element.
   *
   * @param node The node to classify
   */
  private isMicrodataRelated(node: Node): boolean {
    if (node instanceof Element) {
      if (this.hasMicrodataAttribute(node) ||
          node.querySelector(MicrodataExtractor.microdataSelector) !== null) {
        return true;
      }
    }
    return this.insideProperty(node);
  }

  /**
   * Decides whether a batch of mutation records may have changed the entities
   * that `extract` would return.
   *
   * @param records The records delivered to a MutationObserver callback
   * @returns true as soon as any record is found to be relevant, else false
   */
  public entitiesChanged(records: MutationRecord[]): boolean {
    for (const record of records) {
      if (record.type === "childList") {
        for (const node of toArray(record.addedNodes)) {
          if (this.isMicrodataRelated(node)) {
            return true;
          }
        }
        for (const node of toArray(record.removedNodes)) {
          if (this.isMicrodataRelated(node)) {
            return true;
          }
        }
        // Removed nodes are already detached, so their former position is
        // only known through the record's target.
        if (this.insideProperty(record.target)) {
          return true;
        }
      } else if (record.type === "attributes") {
        // A microdata attribute was added, changed or removed.
        if (record.attributeName != null &&
            MicrodataExtractor.microdataAttributes.indexOf(record.attributeName) !== -1) {
          return true;
        }
        // TODO: if id changes to itemref moves...
        if (record.target instanceof Element && this.hasMicrodataAttribute(record.target)) {
          return true;
        }
      } else if (record.type === "characterData") {
        if (this.isMicrodataRelated(record.target)) {
          return true;
        }
      }
    }
    return false;
  }
}