• Operand
  • One point!? Oh.

gram: build

> ./config/nushell/domain.nu

Lenses
(coming soon!)


source ./grammar.nu

# Map an online address to its location in the local disc cache.
#
# - `addr`: absolute URL; only its host and path are used.
# - `--base (-b)`: cache root. Defaults to `~/domain/<host>`; created if missing.
#
# Directory URLs (path ending in `/`) map to `index.html` inside that
# directory. Returns the expanded local path; the file itself is not created.
def "domain map" [addr: string, --base (-b): path] {
  let call = $addr | url parse
  let base = if not ($base | is-empty) { $base } else {
    [ ~ # brea cache
    domain $call.host ] | path join
  } | path expand
  if (($base | path type) != 'dir') { mkdir $base }
  let node = $call.path
  let node = if ($node | str ends-with "/") { $node + "index.html" } else { $node }
  # Strip *every* leading slash, not just the first: an absolute component
  # handed to `path join` replaces `$base`, so a path such as `//etc/hosts`
  # would otherwise resolve outside the cache root.
  let node = $node | str trim --left --char '/'
  [ $base $node ] | path join | path expand
}

# Return the content behind an online address, using the disc cache.
#
# - `addr`: address to read.
# - `--dura (-d)`: pause taken after a download has been written (default 0sec).
#
# When the mapped cache file already exists it is opened and returned.
# Otherwise its parent directory is created and the address is downloaded
# through the `retry` helper; the response is both returned and, via `tee`,
# saved to the cache file, logged, and followed by the `dura` pause.
def "domain page" [
  addr: string,
  --dura (-d): duration = 0sec,
] {
  let file = domain map $addr
  if ($file | path type) != file {
    mkdir ($file | path dirname)
    retry 3 {
      http get $addr | tee {
        save -f $file
        print $"(day) | (cloc) // ($file)"
        sleep $dura
      }
    }
  } else {
    open $file
  }
}

# List the link targets found on an online page.
#
# The page is read through `domain page` (so it is cached on disc), then
# `pup` extracts the `href` attribute of every `a` element, one per line.
#
# - `addr`: address of the page.
# - `--dura (-d)`: forwarded to `domain page` (default 0sec).
def "domain links" [
  addr: string,
  --dura (-d): duration = 0sec,
] {
  domain page $addr --dura $dura
  | pup 'a attr{href}'
  | lines
}

# Crawl outward from a `queue` of addresses.
#
# Each address is taken from the front of the queue; if it has not been seen,
# it is fetched (HTML pages are also mined for links, which are appended to
# the queue) and then marked as `seen`. Already-cached pages are re-read from
# disc, which incurs a spin-up delay in exchange for idempotency and
# completeness.
#
# - `--dura (-d)`: pause after every download, HTML or not (default 2sec).
# - `--seen (-s)`: addresses to treat as already processed.
# - `...queue`: seed addresses; only links on the seeds' hosts are followed.
def "domain index" [
  --dura (-d): duration = 2sec,
  --seen (-s): list<string> = [],
  ...queue: string,
] {
  mut q = $queue; mut s = $seen; mut prune = 0
  let domains = $queue | each { url parse } | get host
  print $"(day) | (cloc) // begin:"
  $queue | to yaml | print

  while ($q | length) > 0 {
    let addr = $q | first

    # Unseen addresses
    if $addr not-in $s {
      let file = domain map $addr

      # Mine any HTML links
      if ($file | path parse | get extension) == html {
        # Immutable snapshot: closures cannot capture `mut` variables.
        let known = $s
        # `url parse` rejects anything that is not an absolute URL
        # (`/about`, `#top`, `page.html`, ...); such links are skipped
        # instead of aborting the whole crawl.
        let links = domain links $addr -d $dura |
          where {|href| (try { $href | url parse | get host } catch { '' }) in $domains } |
          where {|href| $href not-in $known }
        $q = $q ++ $links

      # Ensure non-HTML is on disc, honouring the same download delay.
      } else if ($file | path type) != 'file' {
        domain page $addr -d $dura
      }

      $s = $s ++ [$addr]
    }
    # Pop queue.
    $q = $q | slice 1..

    # Prune: every 500 newly seen addresses, drop duplicate queue entries.
    if (($s | length) - $prune) >= 500 {
      let l = $q | length; $q = $q | uniq
      print $"(day) | (cloc) // seen ($s | length); prune queue: ($l) -> ($q | length)"
      $prune = $s | length
    }
  }
}