Operand

odyssey, u hear?

gram: nue

> ./domain.nu

Lenses
(coming soon!)


source ~/.config/nushell/nix.nu
source ~/.config/nushell/grammar.nu
source ~/.config/nushell/day.nu

# Domain clone commands:
# for use on bulk record collections served as simple html page hierarchies.
#
# Procedures:

# > domain addrs pull static.case.law

# > domain addrs describe static.case.law
#   # here, you can choose the suffixes you need.

# > domain addrs static.case.law | first 4

# > domain sums pull static.case.law --hash sha256 # or md5, and so on; depends on the domain.

# pull all pages
# > domain pages pull static.case.law
# pull pages according to chosen suffixes
# > domain pages pull static.case.law -s [ html tar pdf ]

def "domain base" []: any -> string { "~/domain" | path expand | tee { mkcd } }

# Locus on disc of the domain's address index
# (the file the crawl writes for this domain, dots mapped to underscores).
def "domain addrs index" [ domain: string, ] {
  [ (domain base) ($domain | str replace -a '.' '_') ]
  | path join | path expand
}
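# For example (the returned path is absolute in practice; `~` shown for brevity):
# > domain addrs index static.case.law
# ~/domain/static_case_law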

def "domain addrs" [
  domain: string,
  --suffix (-s): list<string>,
] {
  let addrs = (
  cat (domain addrs index $domain)
  | lines
  | each {|l|
    mut line = $l
    mut labels = []
    mut len = 0

    while ($line | str length) != $len {
      $len = $line | str length
      let parse = $line | addr spider label
      if ($parse | is-not-empty) {
        # print $"parsed '($line)' -> '($parse)'"
        $labels = $labels ++ [$parse.capture0.0]
        $line = $parse.capture1.0
    } }

    { addr: $line, labels: $labels }
  })

  if ($suffix | is-empty) { $addrs } else {
    $addrs | where {
    ($in.addr | path parse | get extension)
    in $suffix } }
}
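# Output shape, with illustrative values (the labels are whatever bracketed
# prefixes the crawler wrote ahead of the address):
# > domain addrs static.case.law | first 1
# [{ addr: "https://static.case.law/<some-path>", labels: [url] }]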

def "addr spider label" [] {
# : string -> record<label: string, remain: string>
  $in | parse -r '\[([\-\w]+)\] - (.+)'
}
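# For example (line shape assumed from gospider's `[label] - ...` output):
# > '[url] - https://static.case.law/robots.txt' | addr spider label
# # => one row: capture0 = "url", capture1 = "https://static.case.law/robots.txt"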

# Crawl a domain with gospider, writing its address index under (domain base),
# and return the index path.
def "domain addrs pull" [
  domain: string,
] {
  ( nsh gospider gospider
    -s $"https://($domain)/"
    -o (domain base)
    -v
    # --whitelist "\/$"
    --whitelist-domain $domain
    --include-subs
    -c 40
    -d 0
  ); domain addrs index $domain
}
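# Usage (also listed under Procedures above): the gospider flags above whitelist
# the target domain (including subdomains), and the index path is returned
# when the crawl finishes.
# > domain addrs pull static.case.law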

# Count the domain's addresses per path extension.
def "domain addrs describe" [
  domain: string,
] {
  domain addrs $domain
  | get addr
  | path parse
  | group-by extension
  | transpose shape nodes
  | each { { shape: $in.shape, num: ($in.nodes | length) } }
}
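# The result is one row per path extension, e.g. (shape only; counts vary):
# > domain addrs describe static.case.law
# # => rows of { shape: <extension>, num: <address count> }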

# Scan all domain addresses ending in a hash checksum extension,
# pulling every sum into a local index file,
# in batches (1000 per round by default).
# ---
# Use when a static file domain
# serves `.md5`, `.sha256`, `.sha512`, or similar
# as route suffixes.
def "domain sums pull" [
  domain: string,
  --hash (-h): string = "sha256",
  --number (-n): int = 1000,
] {
  print $"loading index @ (domain addrs index $domain);"
  let full = open -r (domain addrs index $domain) | lines | length
  print $"($full) lines..."

  let sum_addresses = domain addrs $domain
    | where { ($in.addr | path parse | get extension) == $hash } # 2min 8sec 486ms 693µs 116ns
  # | where { ($in | split row '.' | last) == $hash }       # 2min 15sec 424ms 833µs 767ns
  # | where { ($in | str replace -r '.+\.' '') == $hash }   # 2min 43sec 611ms 12µs 759ns

  mut begin = -1
  mut end = domain sums $domain -h $hash | length

  print $"loaded ($sum_addresses | length) hash addresses."
  print $"($end) already seen."
  while ($end != $begin) {
    $begin = $end
    let seen = domain sums $domain -h $hash | get address

    $sum_addresses
    | where {|l| not ($l.addr in $seen) }
    | first $number
    | par-each {|a| $a
      | insert $hash { retry 4 { http get $a.addr } }
      | select $hash addr
      | to csv -n
      | save -a (domain sums index $domain -h $hash)
    }
    $end = domain sums $domain -h $hash | length # num of rows in (domain sums index $domain -h $hash)
    print $"(clock) / pulled ($number);\t($begin) -> ($end) / ($full)"
  }
}
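# Example run (the --hash value should match a checksum suffix the domain
# actually serves; check with `domain addrs describe` first):
# > domain sums pull static.case.law --hash sha256 -n 1000
# Re-running is safe: addresses already present in the sums index are skipped,
# so an interrupted pull resumes where it stopped.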

# Load all checksum hashes pulled so far for a domain.
# [upgrade] scan for any addr ending in a common hash sequence
# [upgrade] ensure each response is one line, or the csv index may break!
def "domain sums" [
  domain: string,
  --hash (-h): string = "sha256",
] {
  try {
    open -r (domain sums index $domain -h $hash)
    | from csv --noheaders
    | rename $hash address
  } catch { [] }
}
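# The loaded table has two columns: the digest (named after --hash)
# and the address it was pulled from:
# > domain sums static.case.law -h sha256 | first 2
# # => rows of { sha256: <digest>, address: <url> }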

def "domain sums index" [
  domain: string,
  --hash (-h): string = sha256,
] {
  [ (domain base) $"($domain).($hash).index.csv" ]
  | path join | path expand
}
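# For example (derived from the expression above; `~` left unexpanded):
# > domain sums index static.case.law -h md5
# ~/domain/static.case.law.md5.index.csv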

# ---
# Pages; in progress.
# ---

def "domain pages pull" [
  domain: string,
  --suffix (-s): list<string>,
  --number (-n): int = 400,
] {
  print $"loading index @ (domain addrs index $domain);"
  if ($suffix | is-not-empty) {
    print $"- suffix: ($suffix)" } else {
    print "- any suffix" }

  let addrs = domain pages local $domain -s $suffix
  print $"($addrs | length) lines..."

  mut begin = -1
  mut end = $addrs | length

  # print $"loaded ($addrs | length) hash addresses."
  # print $"($end) already seen."
  # while ($end != $begin) {
  #   $begin = $end
  #   let seen = domain sums $domain -h $hash | get addr

  #   $addrs
  #   | where {|l| not ($l.address in $seen) }
  #   | first $number
  #   | par-each {|a|
  #     $a | insert $hash { retry 4 { http get $a.address } }
  #     | select $hash address
  #     | to csv -n | save -a (domain sums index $domain -h $hash)
  #   }
  #   $end = domain sums $domain | length
  #   print $"pulled ($number);\t($begin) -> ($end) / ($full)"
  # }
}

def "domain pages local" [
  domain: string,
  --suffix (-s): list<string>
] {
  let addrs = domain addrs $domain -s $suffix | get addr
  print $"pulling ($addrs | length) pages..."
  $addrs
}