Signal drop!
Relay (operand.online) is unreachable.
Usually, a dropped signal means an upgrade is happening. Hold on!
Sorry, no connection.
Hang in there while we get back on track.
gram: card
> ./src/site_adapters/domScrapingBase.ts
Lenses
(coming soon!)
'use strict';
import { urlContains, extractNumber, urlMatches } from "../utils";
import mapValues from "lodash/mapValues";
import keyBy from 'lodash/keyBy'
import { Attribute, SortConfig, TableAdapter, Table, RecordEdit } from '../core/types'
import { htmlToElement } from '../utils'
import { updateFromSetFormula } from '../end_user_scraper/eventListeners'
import { getCachedActiveAdapter, getCreatingAdapter } from "../end_user_scraper/state";
/** A primitive value that can be placed directly in a table cell. */
type DataValue = string | number | boolean
// WebExtension `browser` global. It is only dereferenced in this file when the
// user agent string contains "Firefox", so it is declared loosely here.
declare var browser : any;
// Todo:
// There are checks in the code for whether a PageValue is an element;
// e.g. for updating values in the page or for highlighting values in the page.
// A more principled way would be to use tagged unions and "pattern match".
// (although it's a bit annoying that we have to manually add tags in our
// runtime data to get this to work...)
// More info here: https://www.typescriptlang.org/docs/handbook/advanced-types.html#discriminated-unions
/** A data value extracted from the page.
 *
 * There are two options for specifying a value:
 *
 * * Element: You can specify a DOM element and Wildcard will extract its
 *   contents. `<input>` elements contribute their current `value`; other
 *   elements are handed to the table as-is and their text is extracted at
 *   display time. If the column is writable, Wildcard will also replace the
 *   contents of the DOM element when the value is edited in the table.
 * * [[DataValue]]: You can run arbitrary code (e.g. regexes) to
 *   extract a value from the DOM and show it in the table.
 *   **Not compatible with editable columns** -- edits to a value that is not
 *   an HTML element are rejected.
 *
 * Note on types: the data type specified in the colSpec will ultimately
 * determine how the value gets displayed.
 */
type PageValue = Element | DataValue
/** A data structure representing a scraped row of data from the page.
* Must specify both an HTML element and an object containing data values.
* (The HTML element is used for things like highlighting and sorting rows.)
*/
/**
 * One row scraped from the page: the DOM element(s) that render it, plus the
 * values extracted for each column.
 * (The elements are what the adapter uses for highlighting and re-ordering rows.)
 */
export interface ScrapedRow {
  /** Stable identifier for the row. */
  id: string;

  /**
   * The element(s) representing the row in the page.
   *
   * Todo: change this to HTMLElement[] to make callers happy
   * (kind of annoying because we want to allow adapter config to return
   * Element[], but then handle the typecasting to HTMLElement
   * in this file, and the TS bookkeeping is a bit messy)
   */
  rowElements: Element[];

  /** Values for the row, keyed by column name. */
  dataValues: Record<string, PageValue>;

  /** Optional element inside which user annotations are rendered. */
  annotationContainer?: HTMLElement;

  /**
   * Optional HTML template used to render one annotation on this row.
   * Should include "$annotation", which is replaced by the annotation text.
   */
  annotationTemplate?: string;
}
/**
 * Extra values for a row obtained from an intercepted AJAX request rather
 * than from the DOM. They are merged into the scraped row with the same `id`.
 */
export interface ScrapedAjaxRow {
  /** Stable identifier of the row these values belong to. */
  id: string;

  /** Values for the row, keyed by column name. */
  dataValues: Record<string, PageValue>;
}
// todo: document this config;
// it's the main thing people need to understand to
// build a site adapter
/**
 * Site-specific configuration consumed by `createDomScrapingAdapter`.
 *
 * Note on parameter types: several callbacks below were declared with bare
 * parameter names and therefore accept `any`. That looseness is kept
 * explicit here so existing site adapters continue to type-check.
 */
export interface ScrapingAdapterConfig {
  /** Name reported by the resulting adapter. */
  name: string;

  /**
   * URL patterns this adapter applies to. When present, the adapter is
   * enabled if any entry is contained in the URL or matches it as a regular
   * expression, and `enabled()` is not consulted.
   */
  matches?: string[];

  /** Free-form metadata carried along with the config. */
  metadata?: object;

  /** Whether the adapter applies to the current page (used when `matches` is absent). */
  enabled(): boolean;

  /** Column definitions for the scraped table. */
  attributes: Attribute[];

  /** Scrapes the current page and returns one entry per row. */
  scrapePage(): ScrapedRow[];

  /**
   * Converts a message received from the extension runtime into extra row
   * values; may return null/undefined when the message is not relevant.
   */
  scrapeAjax?(request: any): ScrapedAjaxRow[];

  /**
   * Called once when the DOM is ready, with a function that re-scrapes the
   * page and notifies subscribers; use it to install re-scrape triggers.
   */
  addScrapeTriggers?(reload: any): void;

  /** NOTE(review): not read in this file -- presumably marks adapters that run inside an iframe; confirm. */
  iframe?: boolean;

  /** Custom function to modify row elements when row is selected */
  onRowSelected?(row: any, attribute?: any): void;

  /** Custom function to modify row elements when row is unselected */
  onRowUnselected?(row: any, attribute?: any): void;
}
/**
 * Invokes `fn` once the document has finished parsing: right away when the
 * document is no longer loading, otherwise when DOMContentLoaded fires.
 */
function onDomReady(fn) {
  const stillLoading = document.readyState === 'loading';
  if (stillLoading) {
    document.addEventListener('DOMContentLoaded', fn);
    return;
  }
  fn();
}
// Takes in as input a site-specific DOM scraping configuration;
// returns a TableAdapter that conforms to the abstract adapter spec.
/**
 * Takes in as input a site-specific DOM scraping configuration;
 * returns a TableAdapter that conforms to the abstract adapter spec.
 *
 * The adapter keeps the most recent scrape result in a closure, merges in any
 * values received through AJAX scraping (Firefox only), and pushes a fresh
 * table to every subscriber each time the page is (re)scraped.
 *
 * @param config Site adapter configuration. The object is kept by reference:
 *   `updateConfig` and `setFormula` mutate it, and `getConfig` returns it.
 * @returns The adapter object (name, tableId and the handler functions below).
 */
export function createDomScrapingAdapter(config:ScrapingAdapterConfig):TableAdapter {
  // Mutable state to be managed for this adapter, as a closure
  let scrapedRows: Array<ScrapedRow> = [];
  let subscribers: Array<(table: Table) => void> = [];
  // Extra data values received via AJAX scraping, keyed by row id.
  const scrapedAjaxRowDict: { [rowId: string]: { [key: string]: PageValue } } = {};

  //Only works for Firefox
  if (navigator.userAgent.indexOf("Firefox") != -1) {
    // Listen to AJAX Requests
    browser.runtime.onMessage.addListener(request => {
      // scrapeAjax is optional in the config; most adapters don't define it.
      if (!config.scrapeAjax) { return; }
      const result = config.scrapeAjax(request);
      if (result !== undefined && result !== null) {
        result.forEach((item) => {
          scrapedAjaxRowDict[item.id] = item.dataValues;
        });
        loadTable();
      }
    });
  }

  // todo: another way to store this would be to
  // create some sort of wrapper type where we add more data to scraped rows
  // Note: nothing currently assigns this (the highlighting code that did is
  // commented out below), so the default un-highlight path is a no-op.
  let originalBorder;

  // Do some light cleanup on the result of the config's scrapePage.
  // This is to make it easier on site adapter developers to
  // avoid doing these steps in their scrapePage functions.
  // Always returns an array, so callers can iterate without null checks.
  const scrapePage = (): Array<ScrapedRow> => {
    const result = config.scrapePage();
    if (!result) { return []; }
    return result.map(scrapedRow => ({
      ...scrapedRow,
      rowElements:
        (scrapedRow.rowElements || [])
          // Make typescript happy with these element types
          .map(el => el as HTMLElement)
          // Don't include a row element if it's not there
          .filter(el => el)
    }));
  }

  // Re-scrape the page, notify every subscriber, and return the new table.
  const loadTable = () => {
    scrapedRows = scrapePage();
    const table = { ...tableInExternalFormat(), attributes: config.attributes };
    // Notify all subscribers that we have new data
    for (const callback of subscribers) { callback(table); }
    return table;
  }

  // Currently, just load data once on dom ready
  // (plus whatever triggers the config installs via addScrapeTriggers).
  // todo: reload data on different triggers
  const initialize = () => {
    onDomReady(() => {
      loadTable();
      if (config.addScrapeTriggers) {
        config.addScrapeTriggers(loadTable);
      }
    })
  }

  const subscribe = (callback) => {
    subscribers = [...subscribers, callback]
    // trigger a load to catch up this subscriber
    loadTable();
  }

  // Note about DOM scraping + sorting:
  // We compute a sort order outside the adapter and pass in the ID list.
  // When a sort is cleared, we revert to the original scraped order.
  // This works fine if we only scrape once on page load.
  // But if we scrape again during page load, while the page is sorted,
  // things get weird -- the sorted order becomes our "unsorted" underlying
  // order. Unfortunately not too much we can do about that.
  const applySort = (finalRecords, sortConfig) => {
    // Locate the container through the first row that actually has an element;
    // with no such row (or a detached element) there is nothing to reorder.
    const anchorRow = scrapedRows.find(row => row.rowElements.length > 0);
    const rowContainer = anchorRow ? anchorRow.rowElements[0].parentElement : null;
    if (!rowContainer) { return; }

    const scrapedRowsById = keyBy(scrapedRows, r => r.id);
    // todo: this clears out other non-row content, which isn't ideal.
    // but there's not a super great way to keep it in the right place...
    rowContainer.innerHTML = ""
    finalRecords.forEach(r => {
      const scrapedRow = scrapedRowsById[r.id];
      // Records without a scraped counterpart have no elements to place.
      if (!scrapedRow) { return; }
      scrapedRow.rowElements.forEach(el => {
        rowContainer.appendChild(el);
      })
    })
  }

  // todo: does this API really need to be async...?
  // Writes each edit back into the page element backing the edited cell, then
  // reloads the table. Edits for unknown rows are skipped. The first edit that
  // targets a value not backed by an HTML element rejects the whole call
  // (edits processed before it have already been applied).
  const editRecords = (edits:Array<RecordEdit>) => {
    for (const { recordId, attribute, value } of edits) {
      const scrapedRow = scrapedRows.find(sr => sr.id === recordId);
      if (!scrapedRow) { continue; }
      const scrapedValue = scrapedRow.dataValues[attribute];
      if (!(scrapedValue instanceof HTMLElement)) {
        return Promise.reject("Can't edit scraped value, site adapter must return HTML element to be editable.")
      }
      if (scrapedValue instanceof HTMLInputElement) {
        scrapedValue.value = value;
      } else {
        scrapedValue.textContent = value;
      }
    }
    return Promise.resolve(loadTable());
  }

  // the simplest thing to do is just handle a whole new table,
  // and re-annotate everything on the page.
  // if this is too slow, we can get smarter,
  // eg only re-annotate certain rows.
  const handleOtherTableUpdated = (newTable) => {
    for (const record of newTable.records) {
      const scrapedRow = scrapedRows.find(sr => sr.id === record.id);
      if (!scrapedRow || !scrapedRow.annotationContainer) continue;

      // todo: set a default annotation container + target.
      // create the annotation target if it doesn't exist
      let annotationTarget = scrapedRow.annotationContainer.querySelector(".user-annotations");
      if (!annotationTarget) {
        annotationTarget =
          htmlToElement("<span class='user-annotations'></span>")
        scrapedRow.annotationContainer.appendChild(annotationTarget)
      }

      // The template is optional on ScrapedRow; without one, show the bare value.
      const template = scrapedRow.annotationTemplate || "$annotation";

      // add the actual annotations to the page
      // NOTE(review): values are inserted as raw HTML; confirm they are trusted.
      const annotationsHTML =
        newTable.attributes
          .filter(attr => !attr.hideInPage) // hide columns hidden in page
          .map(attr => record.values[attr.name])
          // skip empty cells so they don't render as "undefined" / "null"
          .filter(value => value !== undefined && value !== null)
          // A replacer function keeps "$$", "$&" etc. in the annotation text
          // literal instead of being read as replacement patterns.
          .map(value => template.replace("$annotation", () => String(value)));
      annotationTarget.innerHTML = annotationsHTML.join(" ")
    }
  }

  // convert scraper-internal data structure to
  // the standard format for all wildcard adapters
  const tableInExternalFormat = ():Table => {
    // Read the attributes at call time: updateConfig can replace the list.
    const attributes = config.attributes;

    // hmm, don't love creating parallel data structures for values + elements;
    // maybe better:
    // * user constructs just elements
    // * base DOMScrapingAdapter constructs elements + values
    // * base DOMScrapingAdapter uses that to output the "just values" version
    const combinedRows = scrapedRows.map(row => {
      // AJAX-scraped values override DOM-scraped ones for the same column.
      row.dataValues = {...row.dataValues, ...scrapedAjaxRowDict[row.id]};
      return row;
    });

    const records = combinedRows.map(row => ({
      id: row.id,
      values: mapValues(row.dataValues, (value, attrName) => {
        // Inputs contribute their current value. Other DOM elements are
        // returned as-is to the table; text is extracted at display time.
        let extractedValue: any =
          value instanceof HTMLInputElement ? value.value : value;

        // do type conversions
        // (data keys with no matching attribute are passed through untouched)
        const spec = attributes.find(candidate => candidate.name === attrName);
        if (spec && spec.type === "numeric" && typeof extractedValue !== "number") {
          extractedValue = extractNumber(extractedValue);
        }
        return extractedValue;
      })
    }));

    return {
      tableId: "app",
      attributes: attributes,
      records: records
    }
  }

  const addAttribute = () => {
    return Promise.reject("Attributes can only be added to user tables.")
  }

  // Sets the formula of the named attribute on the config.
  // Returns whether downstream state needs updating and, when the formula
  // changed, the column to update.
  // NOTE(review): `column` is the attribute index minus one, and an unknown
  // attribute name reports `updated: true` without a column -- both preserved
  // from the existing behavior; confirm against updateFromSetFormula.
  const updateAttribute = ({ attrName, formula }) => {
    const attributeIndex = config.attributes.findIndex(attribute => attribute.name === attrName);
    if (attributeIndex > -1) {
      const attribute = config.attributes[attributeIndex];
      if (attribute.formula !== formula) {
        attribute.formula = formula;
        return {
          updated: true,
          column: attributeIndex - 1
        };
      }
      return {
        updated: false
      };
    }
    return {
      updated: true
    };
  }

  const toggleVisibility = (colName) => {
    return Promise.reject("Visibility can only be toggled for user tables.")
  }

  // todo: support highlighting individual attributes
  // within a row (generally when there's one row)
  const handleRecordSelected = (recordId, attribute) => {
    for (const sr of scrapedRows) {
      // Handle highlighting
      if (sr.id === recordId) {
        if (config.onRowSelected) {
          config.onRowSelected(sr);
        } else if (sr.rowElements.length > 0) {
          // make the row appear selected in the page
          sr.rowElements[0].scrollIntoView({ behavior: "smooth", block: "center" })
          // sr.rowElements.forEach((el, index) => {
          //   if ((el as HTMLElement).style) {
          //     // oversimplified way to remember old styles before highlighting --
          //     // just remember a single original border value across all rows.
          //     // (works OK if all rows share same border styling)
          //     if (originalBorder === undefined) {
          //       originalBorder = (el as HTMLElement).style.border;
          //     }
          //     (el as HTMLElement).style.border = `solid 2px #c9ebff`
          //   }
          // })
        }
      // Handle unhighlighting
      } else {
        if (config.onRowUnselected) {
          config.onRowUnselected(sr);
        } else {
          sr.rowElements.forEach((el) => {
            if ((el as HTMLElement).style && originalBorder !== undefined) {
              (el as HTMLElement).style.border = originalBorder;
            }
          });
        }
      }
    }
  }

  // Whether this adapter applies to the current page: `matches` wins when
  // present, otherwise the config's own enabled() decides.
  const enabled = () => {
    const { matches } = config;
    if (matches) {
      return matches.some(match => {
        if (urlContains(match)) { return true; }
        // A match string is not necessarily a valid regular expression
        // (e.g. "*.example.com"); treat those as plain substrings only.
        let pattern: RegExp;
        try {
          pattern = new RegExp(match);
        } catch (e) {
          return false;
        }
        return urlMatches(pattern);
      });
    } else if (config.enabled) {
      return config.enabled();
    }
    return false;
  }

  return {
    name: config.name,
    enabled,
    // todo: maybe have different table ids for each adapter,
    // rather than make them all "app"?
    tableId: "app",
    loadTable,
    initialize,
    subscribe,
    applySort,
    editRecords,
    handleOtherTableUpdated,
    handleRecordSelected,
    addAttribute,
    toggleVisibility,
    // Nothing to clear for a scraped table.
    clear: () => {},
    // Formulas are only applied while an adapter is being created.
    setFormula: (attrName, formula) => {
      if (getCreatingAdapter()) {
        const { updated, column } = updateAttribute({ attrName, formula });
        if (updated) {
          updateFromSetFormula({ formula, column });
        }
      }
    },
    // Swap in new attributes / scraper / metadata and re-scrape.
    updateConfig: (_config) => {
      config.attributes = _config.attributes;
      config.scrapePage = _config.scrapePage;
      config.metadata = _config.metadata;
      loadTable();
    },
    getConfig: () => {
      return config;
    }
  }
}