Artifact 78d9c8a4732895440e3d4fc3a774ae992835f4b9:
- File
packages/dir/lib/fileset.tcl
— part of check-in
[fc513294dd]
at
2018-12-26 21:06:16
on branch trunk
— dir iter
fix bug in file type reporting
dir scan
add indexes for better performance
gryp archive
batch insertions to improve performance
(user: pooryorick size: 24770)
#! /bin/env tclsh package require sqlite3 package require Thread package require fileutil::magic::filetype 1.2- namespace import [yclprefix] if {[namespace tail [yclprefix]] ne {ycl}} { rename [namespace tail [yclprefix]] ycl } package require lexec package require {ycl chan interp interp} namespace import [yclprefix]::chan::interp::interp rename interp rinterp package require {ycl coro call} namespace import [yclprefix]::coro::call::hi namespace import [yclprefix]::coro::call::reply namespace import [yclprefix]::coro::call::routine package require {ycl chan diagnostic} namespace import [yclprefix]::chan::diagnostic package require {ycl dir} namespace import [yclprefix]::dir namespace import [yclprefix]::dir::invade package require {ycl file signature} namespace import [yclprefix]::file::signature package require {ycl list} namespace import [yclprefix]::list::add package require {ycl math rand} namespace import [yclprefix]::math::rand::randprint_256 package require {ycl eav sqlite} package require {ycl parse tcl commands} namespace import [yclprefix]::parse package require {ycl proc} namespace import [yclprefix]::proc::checkargs package require {ycl db sqlite} namespace import [yclprefix]::db::sqlite::util::gen::strquote proc dupopen_sqlite {sqlitedb} { $sqlitedb eval {create table t1()} } proc dupsave_sqlite {dbase chan} { } variable doc::duplicates { description { Find duplicates in a fileset } args { _ {} } } proc duplicates {_ args} { set name [namespace current]::duplicats_[info cmdcount] routine $name {_ args} { checkargs [$_ $ doc::duplicates] {*}$args hi $_ eav find {} matching sha256 eval record { reply $record(e) } } $_ {*}$args } [namespace current] .method duplicates variable doc::enter { description { Introspect a file for any streams it contains. Returns a boolean value indicating whether it entered the file . This can be used, for example, by a monitoring program to decide whether to depost the whole file or just its contents into a repository . } } proc enter {_ path type mime ext} { # {to do} {Currently, password-protected files wait for a password if stdin # is a terminal. Maybe this is ok, or maybe there's room for improvement. # Maybe implement a list of passwords to try whenever a password-protected # file is encountered} namespace upvar $_ handlers handlers # The caller should make sure that the file exists and is readable . It is # an error here if the file can't be read . If the file doesn't exist , # it's a broken symlink that should be captured . if {![file readable $path] && [file exists $path]} { if {![file readable $path]} { error [list {not readable} $path] } } switch [lindex $mime 1] { x-bzip2 { set tempdir [dir mktempdir] if {[catch {lexec::exec | [list tar -xjf $path -C $tempdir] >@stdout 2>@stderr} cres copts]} { lappend cres2 $cres set ftail [file tail $path] regsub {\.bz2?$} $ftail {} ftail if {[catch {lexec::exec | [list bunzip2 -k $path] >$tempdir/[ file tail $path] >@stdout 2>@stderr} cres copts]} { lappend cres2 $cres return -code error [ list {could not enter} type $type path $path \ cres $cres2] } } } x-gzip { set tempdir [dir mktempdir] if {[catch {lexec::exec | [list tar -xzf $path -C $tempdir] >@stdout 2>@stderr} cres copts]} { lappend cres2 $cres set ftail [file tail $path] regsub {\.gz$} $ftail {} ftail if { [catch {lexec::exec | [list gunzip -qc $path] \ >$tempdir/$ftail 2>@stderr} cres copts] } { lappend cres2 $cres return -code error [ list {could not enter} type $type path $path \ errors $cres2] } } } x-xz { set tempdir [dir mktempdir] if {[catch {lexec::exec | [list tar -xJf $path -C $tempdir] >@stdout 2>@stderr} cres copts]} { lappend cres2 $cres set ftail [file tail $path] regsub {\.xz$} $ftail {} ftail if { [catch {lexec::exec | [list xzcat $path] \ > $tempdir/$tail 2>@stderr} cres copts] } { lappend cres2 $cres return -code error [ list {could not enter} type $type path $path \ cres $cres2] } } } rar { set tempdir [dir mktempdir] set oldpwd [pwd] cd $tempdir set status [catch {lexec::exec | [list unrar $path] >@stdout 2>@stderr} cres copts] cd $oldpwd if {$status} { return -code error [ list {could not enter} path $path cres $cres] } } zip { set tempdir [dir mktempdir] set oldpwd [pwd] cd $tempdir # </dev/null doesn't defeat interactive prompts ,so use -P {} as well set status [catch {lexec::exec | [list unzip -P {} -o $path] >@stdout 2>@stderr </dev/null} cres copts] cd $oldpwd if {$status} { return -code error [ list {could not enter} path $path cres $cres] } } default { return {} } } return $tempdir } [namespace current] .method enter variable doc::find { description { {ycl coro call reply} with the name of each file in a fileset. } args { _ {} path { description { The path to begin from } default {} } match { description { An [eav find] search fragment } default {lindex {}} } } } proc find {_ args} { checkargs [$_ $ doc::find] {*}$args if {[info exists path]} { set entity [$_ lookup $path] if {$entity eq {}} { return -code error [list {no such path} $path] } } else { set entity [$_ eav set [$_ $ entity] root] } set name ${_}::[info cmdcount] routine $name {_ entity match} { hi set res [$_ eav find {n} descend {d entity == $entity} {*}$match eval record { reply $record(e) }] } $_ $entity $match } [namespace current] .method find proc fstat {_ path} { namespace upvar $_ attmap attmap ftypemap ftypemap set stats [$_ stat $path] dict for {key val} $stats { if {[dict exists $attmap $key]} { dict unset stats $key dict set stats [dict get $attmap $key] $val } } set type [dict get $stats t] if {[dict exists $ftypemap $type]} { dict set stats t [dict get $ftypemap $type] } return $stats } [namespace current] .method fstat variable doc::init { description { A single directory } args { _ { description { This shelf . } } eav { description { The eav to use . } default {} } magic { description { The magic value that identifies the eav record for this path } default {randprint_256} process {$_ $ magic $magic} } pathcache_max { description { Maximum number of paths to hold in the path cache } default {lindex 25000} process {$_ $ pathcache_max $pathcache_max} } dbfile { description { If an eav is not provided, the name of a file that the automatically-created eav should use. } default {lindex :memory:} } } } variable doc::configure $doc::init proc init {_ args} { checkargs [$_ $ doc::init] {*}$args $_ $ magic [$_ $ magic] if {![info exists eav]} { [yclprefix] eav sqlite eav ${_}::eav fname $dbfile dbinit { PRAGMA journal_mode=OFF ; PRAGMA synchronous=OFF } } $_ .eval { namespace import [yclprefix]::coro::call::hi } $_ .routine eav $_ $ pathcache {} $_ $ attmap [$_ $ attmap] $_ $ numeric [$_ $ numeric] $_ $ ftypemap [$_ $ ftypemap] set entities [$_ eav find {}] set entities [$_ eav find {} == $magic $magic] if {[llength $entities]} { $_ $ entity [lindex $entities end] } else { $_ $ entity [$_ eav set {} $magic $magic] } $_ eav dset [list [$_ $ entity] root] n {} return $_ } [namespace current] .method init proc lookup {_ path} { set entity [$_ eav set [$_ $ entity] root] foreach component $path { set entities [$_ eav find {} == n $component == d [expr $entity]] if {![llength $entities]} { return } else { set entity [lindex $entities end] } } return $entity } [namespace current] .method lookup proc log {_ msg} { puts stderr [list $_ {*}$msg] } [namespace current] .method log variable doc::manifest { description { Produce a manifest of a fileset } args { _ {} chan { description { A channel to write the manifest to. } } include { description { A list of additional fields to include in the manifest . The empty string means all available fields . } default {} process { ::apply [list include { upvar fields fields fields_more fields_more \ fields_standard fields_standard fields_less fields_less \ usr_include usr_include if {$include eq {}} { set fields {} } foreach item $include { if {$item in {less more standard}} { add fields {*}[$ $ fields_$item] } else { add fields $item } } } [namespace current]] $include } count -1 } } } proc manifest {_ args} { namespace upvar $_ fields_more fields_more \ fields_standard fields_standard fields_less fields_less set fields [$_ $ fields_standard] checkargs [$_ $ doc::manifest] {*}$args set i [expr 0] dict set items [$_ eav set [$_ $ entity] root] $i {*}$chan puts [list $i [dict create n {} type v]] set iter [$_ find] while 1 { set item [{*}$iter] dict set items $item [incr i] set info [$_ eav set $item] set res {} if {[dict exists $info d]} { # Directories occur before their contents dict set info d [dict get $items [dict get $info d]] } if {[llength $fields]} { foreach field $fields { if {[dict exists $info $field]} { dict set res $field [dict get $info $field] } } foreach sigtype {sha512 sha256 sha1 md5} { if {[dict exists $info $sigtype]} { dict set res $sigtype [dict get $info $sigtype] break } } {*}$chan puts [list $i $res] } else { {*}$chan puts [list $i $info] } } {*}$chan flush } [namespace current] .method manifest variable doc::node { description { Create a new node in the fileset . } } proc node {_ path} { namespace upvar $_ pathcache pathcache namespace upvar $_ pathcache_max pathcache_max # Canonicalize the path set dir [list {*}[lrange $path 0 end-1]] if {[dict exists $pathcache $dir]} { set entity [dict get $pathcache $dir] set path [lrange $path[set path {}] end end] } else { set dir {} set entity [$_ eav set [$_ $ entity] root] } lappend res $entity while {[llength $path]} { set path [lassign $path[set path {}] component] set newentity [$_ eav find {} == n $component == d $entity] if {$newentity < -Inf} { set path [linsert $path[set path {}] 0 $component] break } else { if {[llength $path]} { lappend dir $component if {![dict exists $pathcache $dir]} { dict set pathcache $dir $newentity } } lappend res $newentity set entity $newentity } } while {[llength $path]} { set path [lassign $path[set path {}] component] # Leave the "t" (type) attribute undefined for synthetic nodes # because there are nodes other than directories , such as # archives, that have child nodes . The caller can decide the # disposition of the "t" attribute . [$_ set] detects synthetic # nodes by the absence of "t" by the absence of "t" by the # absence of "t" by the absence of "t" set entity [ $_ eav set {} n $component d [expr {$entity + 0}]] if {[llength $path]} { lappend dir $res $component if {![dict exists $pathcache $dir]} { dict set pathcache $dir $entity } } lappend res $entity } if {[dict size $pathcache] > $pathcache_max} { set over [expr {[dict size $pathcache] - $pathcache_max}] foreach key [dict keys $pathcache] { dict unset patchcache $key if {[incr over -1] == 0} break } } return $res } [namespace current] .method node proc path {_ entity} { set res {} set name [$_ eav set $entity n] if {$name eq {}} { return {} } $_ eav find n ascend {d entity == $entity} eval record { lappend res $record(v) } lappend res $name return [lrange $res[set res {}] 1 end] } [namespace current] .method path variable doc::read { description { Ingest a manifest . Returns the number of fileset entities created } args { _ {} chan { description { The channel to read from . } } into { description { A list that constitutes a path that that the scanned fileset should be placed under. } default {lindex {}} } update { description { see the description for [scan] } default {lindex false} } mapshort { description { map short attributes to their long forms } default {lindex true} } } } proc read {_ args} { upvar 0 [$_ $.locate attmap] attmap # Invariant: Directories occur before their contents. checkargs [$_ $ doc::read] {*}$args set nodemap {} set current {} dict for {key val} $attmap { dict set rattmap $val $key } if {$update} { set action update } else { set action set } set iter [parse tcl commands iter chan $chan] try { lassign [{*}$iter] i info } on break {cres copts} { error [list {no header}] } dict set nodemap $i [$_ node {}] while 1 { set item [{*}$iter] lassign $item i info set name [dict get $info n] set parent [dict get $nodemap [dict get $info d]] dict set info d $parent set path [$_ path $parent] lappend path $name set entity [lindex [$_ node $path] end] $_ $action $entity {*}$info dict set nodemap $i $entity } return } [namespace current] .method read proc fsplit {_ path} { if {[file pathtype $path] ne {absolute}} { return -code error [list {path is not absolute}] } set path [file split $path[set path {}]] if {[lindex $path 0] eq {/}} { set path [lreplace $path[set path {}] 0 0] } return $path } [namespace current] .method fsplit variable doc::args { description { scan the contents of the repository . } args { follow { description { Whether and when to follow symlinks . } default {lindex no} validate {$follow in {inside no outside yes}} } } } variable doc::scan { description { scan a directory hierarchy . It isn't the job of this function to resolve a symbolic link or process the contents of the file a symlink points to . $iter can provide both a symbolic link and the file it links to , if desired . A symlink itself , however , is recorded in the fileset . This comman will proceed even if some file information can't be gathered . The caller should verify that the results are satisfactory. For example , the caller may wish to verify that metadata was collected for all files . To do this as the scan progresses , uses $process . } args { _ {} previous { description { The name of a file where information about a previous run was recorded as a dictionary of information about a previous scan, with the file names as keys, and file information as values. For each file in $iter, if the file name is found in $previous, the file information is copied from there rather than being generated from the live file. } default {} process { dict set scanopts previous $previous sqlite3 ${_}::previous $previous $_ .routine previous } } iter { description { A prefix for a {ycl coro call} iterator of absolute paths of the files to record . } } infofailed { description { A Command prefix to execute, applying the name of the file, the failure result, and the options dictionary, when file metatadata can't be read . } default {list ::apply {{fname res opts} {dict incr opts -level; return -options $opts $res}}} process { dict set scanopts infofailed $infofailed lindex $infofailed } } into { description { A list that constitutes a path that that the scanned fileset should be placed under. If the target of a symbolic link is abolute, the path indicated by $into is prepended to it. } default {lindex {}} } record { description { record the scan results in the database } default {lindex true} process { dict set scanopts record $record lindex $record } } trim { description { The number of initial elements to remove from each filename . } default {lindex 0} } preproc { description { A command prefix to which the following items are appended: path } default {list apply {path {return $path}}} process { dict set scanopts preproc $preproc lindex $preproc } } postproc { description { A command prefix to which the the following items are appended: entity path { description { The path of the file in the fileset } } entered { description { A boolean value indicating whether or not a scan into the file was performed . } } fname { The path of the file on the filesystem } info { description { File info. } } containing its attributes are appended . } default {list list} process { dict set scanopts postproc $postproc lindex $postproc } } update { description { Whether to update records that already exist. By default, records can not be updated, which protects from inadvertent data loss, for example, when an emulation layer like Cygwin recognizes difference in case processes files on an NTFS filesystem where differences in case are not recognized. } default {lindex false} process { dict set scanopts update $update lindex $update } } } } proc scan {_ args} { checkargs [$_ $ doc::scan] {*}$args if {$update} { set action update } else { set action set } namespace upvar $_ attmap attmap magic magic dict update attmap link att_link {} set i 0 # Save time here by setting this outside the loop , even though the empty # string is only expected when $record is false set entity {} $_ eav db eval {savepoint one} while 1 { set fname [{*}$iter] diagnostic info processing $fname if {$i > 999} { $_ eav db eval { release one } set i 0 } if {$i == 0} { $_ eav db eval { savepoint one } } # fsplit verifies that $fname is an absolute path set fname [{*}$preproc $fname] set fpath [$_ fsplit $fname] if {$trim} { set fpath [list {*}[lrange $fpath[set fpath {}] $trim end]] } set fpath [list {*}$into {*}$fpath] if {![llength $fpath]} { error [list {encountered root directory} $fname] } set oldperms {} if {![file readable $fname]} { if {[catch { set oldperms [file attributes $fname -permissions]} cres copts]} { diagnostic warning {could not capture permissions for} $fname diagnostic warning [dict get $copts -errorinfo] } } set finfo {} if {[info exists previous]} { lassign [$_ previous eval { select info , entered from files where name == :fpath}] finfo entered if {$entered eq {}} { set entered 0 } } else { set entered 0 } if {![llength $finfo]} { set block { set finfo [$_ fstat $fname] if {[file isfile $fname]} { set finfo [dict merge $finfo[set finfo {}] [$_ signature run $fname]] } } try $block on ok {tres topts} {} on error {tres topts} { try {invade $fname do $block} on error {tres1 topts1} { # Return the first earlier information collected so that # arbitrary errors aren't lost . {*}$infofailed $fname $tres $topts } } } if {[dict exists $finfo $att_link]} { set link [dict get $finfo $att_link] if {[file pathtype $link] eq {absolute}} { #{to do} {add $into to link ?} } } if {[dict exists $finfo T]} { set mime [dict get $finfo T] #{to do} {type mime filetype} set type {} set ext {} } elseif {[file isfile $fname] || [file isdirectory $fname]} { try { lassign [fileutil::magic::filetype $fname] type mime ext } on error {cres copts} { if {![dict exists $finfo $att_link]} { try {invade $fname do { lassign [fileutil::magic::filetype $fname] type mime ext }} on error {cres1 copts1} { # Return the first earlier information collected so that # arbitrary errors aren't lost . return -options $copts $cres } } # Must be a broken link set type {} set mime {} set ext {} } on ok {} { if {[llength $mime]} { dict set finfo T $mime } } } else { set type [dict get $finfo t] set mime {} set ext {} } if {$record} { set entity [lindex [$_ node $fpath] end] # [$_ set] behaves differently if $finfo is empty if {[dict size $finfo]} { set entity [$_ $action $entity {*}$finfo] } } else { # set the name here instead of calling [$_ node] dict set finfo n [lindex $fpath end] } set tempdir {} if {$entered != 2} { set block { set tempdir [$_ enter $fname $type $mime $ext] if {$tempdir eq {}} { set entered 0 } else { set entered 1 } } try $block on error {cres copts} { try {invade $fname do $block} on error {cres copts} { diagnostic warning [list {error on enter} file $fname error $cres opts $copts] set entered 0 } } } {*}$postproc $entity $fpath $entered $fname $finfo if {$tempdir ne {}} { try { if {$record} { set into2 [$_ path $entity] } else { set into2 $fpath } set enteriter [dir contents [list [dir iter $tempdir invade true] next]] puts stderr [list recursively scanning $tempdir $into2] $_ scan iter $enteriter into $into2 {*}$scanopts trim [ expr {[llength [file split $tempdir]] - 1}] {*}$postproc $entity $fpath 2 $fname $finfo } finally { dir destroy $tempdir } } elseif {$entered == 2} { # $entered is only 2 if a scan of the contents was # previously completed . $_ previous eval "select name , info , entered from files where name like [ strquote $fpath\ %]" comprec { if {[lindex $fpath end] ne [lindex $comprec(name) [llength $fpath]-1]} { continue } {*}$postproc $entity $comprec(name) $comprec(entered) \ [join $comprec(name) /] $comprec(info) } } if {[info exists oldperms]} { try {file attributes $fname -permissions $oldperms} on error {cres copts} { puts stderr [list {could not restore original permissions} $cres] } } incr i } # {to do} {Why is there a complaint here that no such savepoint exists?} catch {$_ eav db eval {release savepoint one}} return } [namespace current] .method scan variable doc::set { description { set fields of a node, which must already exist. } } proc set_ {_ entity args} { if {[llength $args] > 1} { set current [$_ update $entity] foreach {key val} $args { if {[dict exists $current $key] && [dict get $current $key] != $val} { if {![dict exists $current t]} { # This is a "pure" directory, possibly created by [node], # and not containing any real file information, just name, # type, and parent directory. continue } error [list {already exists} $entity $key {existing value} [ dict get $current $key] {new value} $val] } } } $_ update $entity {*}$args } [namespace current] .method set set_ proc update {_ entity args} { namespace upvar $_ numeric numeric if {[llength $args]} { if {[llength $args] > 1} { # {To do} {Can [read] be written such that this check isn't # necessary ?} set d [$_ eav set $entity d] if {[dict exists $args d]} { if {[dict get $args d] != $d} { error [list {directory conflict} \ {fileset directory} $d \ {user specified directory} [dict get $args d]] } } dict for {key val} $args { if {$key in $numeric} { if {[string is entier -strict $val]} { set val [expr {$val + 0}] dict set args $key $val } } } $_ eav set $entity {*}$args return $entity } return [$_ eav set $entity {*}$args] } else { set res [$_ eav set $entity] } return $res } [namespace current] .method update update [namespace current] .routine previous [namespace current] .routine eav [namespace current] .routine interp1 [namespace current] .routine stat dir stat run [namespace current] .routine signature [namespace current] $ numeric {a c d g i m p s t u v} [namespace current] $ attmap { atime a ctime c gid g ino i link l mtime m mode p size s type t uid u dev v } [namespace current] $ ftypemap { file f directory d link l } #[namespace current] $ rchan [open |[list [info nameofexecutable] - 2>@stderr] r+] #[rinterp .spawn interp1] init [[namespace current] $ rchan] [namespace current] $ fields_more {a c d g link p m n s t u} [namespace current] $ fields_standard {d link p m n s t} [namespace current] $ fields_less {d link p n s t} ## Assumption:: we are in a coroutine context #[namespace current] interp1 send { # package require lexec #}