#!/usr/bin/env tclsh
#documentation metadata for [load], consumed by [checkargs]
#(the braced value is data; comments cannot go inside it)
variable doc::load {
    args {
        chan {
            description {
                a channel opened for reading. Data from the
                scanned files will be stored/updated in it
            }
        }
    }
}
#load: rebuild the file database dict from a serialized matrix read from $chan.
#shared vars: dupnames (ordered field names bound to each matrix row),
#matrixcount (counter used to generate unique matrix command names)
#state var: dbase (the database dict being rebuilt)
#returns the populated dbase, keyed by size,dev,ino,mtime,ctime
variable load {{args} {dupnames matrixcount} {dbase} {
    checkargs doc::load
    #each call gets its own uniquely-named scratch matrix command
    set id [incr matrixcount]
    if {$chan eq {}} {
        return {}
    }
    #rewind and slurp the whole channel
    seek $chan 0
    set dbase [string trim [read $chan]]
    if {$dbase eq {}} {
        #no saved data: start from a fresh, empty matrix
        #NOTE(review): 14 columns here vs 15 in [save] -- confirm which is right
        matrix matrix$id
        matrix$id add columns 14
    } else {
        #puts [list deserialize database]
        matrix matrix$id
        #the trailing [set dbase {}] releases dbase's reference so the
        #string passed to deserialize is unshared
        matrix$id deserialize $dbase[set dbase {}]
        #puts [list deserialize done]
    }
    set rows [matrix$id rows]
    #looping backwards is much faster when deleting matrix rows, because matrix
    #delete internally uses lreplace, which causes all later items to slide
    #forward in the list
    for {set i [expr {$rows-1}]} {$i >= 0} {incr i -1} {
        set row [matrix$id get row $i]
        #this doesn't save any space, because internally, Tcl shares the
        #objects anyway
        #matrix$id delete row $i
        #bind each column value to its field name
        lassign $row {*}$dupnames
        #identity key; presumably dupnames includes size dev ino mtime ctime
        #-- TODO confirm against the caller that sets dupnames
        set key $size,$dev,$ino,$mtime,$ctime
        foreach name $dupnames {
            dict set dbase $key $name [set $name]
        }
    }
    #puts [list dupopen done]
    matrix$id destroy
    return $dbase
}}
#documentation metadata for [save] (no documented args yet)
variable doc::save {}
#save: serialize $dbase to $chan via a scratch matrix.
#arg: chan -- a writable channel
#shared var: matrixcount (unique matrix name counter); state var: dbase
#side effect: dbase is emptied as its records are written out
variable save {{chan} {matrixcount} {dbase} {
    set id [incr matrixcount]
    matrix matrix$id
    #NOTE(review): 15 columns here vs 14 in [load] -- confirm which is correct
    matrix$id add columns 15
    dict for {key val} $dbase {
        matrix$id add row {*}[dict values $val]
        #release each record as it is written, so dbase ends up empty
        dict unset dbase $key
    }
    puts $chan [matrix$id serialize]
    matrix$id destroy
}}
#documentation metadata for [check], consumed by [checkargs]
#NOTE(review): the empty {} after "description" does not appear in the other
#doc:: entries in this file -- confirm whether it is intentional
variable doc::check {
    description {} {
        check to see whether $fpath is a duplicate of any file in the fileset
    }
    value {
        either the signature of the duplicated file or {}, if $fpath is not a
        duplicate
    }
    args {
        dbasename {
            positional
            dataset to check against
        }
        fpath {
            positional
            path of candidate duplicate
        }
    }
}
#check: determine whether $fpath duplicates a file already in the fileset.
#arg: fpath -- path of the candidate duplicate (caller should pass only
#existing files)
#state vars: dbase (scan records keyed by size,dev,ino,mtime,ctime),
#masters (sig -> id and path of the canonical copy of each signature),
#inseen (identity key -> sig cache for candidates hashed during this run)
#returns the signature of the duplicated file, or {} if $fpath is not a
#duplicate
variable check {{fpath} {} {dbase masters inseen} {
    file stat $fpath fstats_array
    set fstats [array get fstats_array]
    #bring size/dev/ino/mtime/ctime into local scope
    dict with fstats {}
    set duplicate {}
    #todo: make sure this mtime ctime shortcut is bulletproof
    #it should be, since it's only used for the current scan of the candidate
    #duplicates
    #this key corresponds with the key in [dupscan]
    set key $size,$dev,$ino,$mtime,$ctime
    if {[dict exists $inseen $key]} {
        #already hashed this identity during this run
        set sig [dict get $inseen $key sig]
    } else {
        #caller should only provide existing files
        #it is an error for file not to exist
        if {[catch {set sig [$_ sha digest ${fpath}]} eres eopts]} {
            if {[dict get $eopts -errorcode] eq {NONE}} {
                #generic errors are unexpected: propagate them
                return -options $eopts $eres
            }
            #errors carrying a specific errorcode (e.g. POSIX) are treated
            #as "not a duplicate"
            return {}
        } else {
            dict set inseen $key sig $sig
        }
    }
    #puts [list in signature $fpath $sig]
    if {[dict exists $masters $sig]} {
        set mkey [dict get $masters $sig id]
        if {$mkey eq $key} {
            #this is the master. Don't delete it
            return {}
        }
        #assume the master still exists. Rationale: it was put in the list
        #of masters during this run
        set duplicate $sig
    }
    if {$duplicate eq {}} {
        set candidates [dict keys $dbase $size,*]
        set csearched 0
        foreach candidate $candidates {
            #whether the current candidate's file still exists; -1 means
            #unknown.  Reset per candidate: previously this was set once
            #before the loop, so one candidate's existence result leaked
            #into the checks for every later candidate (a stale 0 would
            #suppress all later matches; a stale 1 would skip the existence
            #check for a possibly-deleted file)
            set stillexists -1
            incr csearched
            if {$candidate eq $key} {
                if {![dict exists $masters $sig]} {
                    #first sighting of this identity: record it as the master
                    dict set masters $sig id $candidate
                    dict set masters $sig path [dict get $dbase $candidate path]
                    #this is the master, not a duplicate
                    break
                }
                #it's the same identity as the potential duplicate.
                #don't compare it against itself
                continue
            }
            if {[dict get $dbase $candidate sig] eq {}} {
                #signature not cached yet; compute and cache it
                #puts [list generate of signature [
                #	dict get $dbase $candidate path]]
                set path [dict get $dbase $candidate path]
                if {[file exists $path]} {
                    if {[catch {
                        set csig [$_ sha digest [dict get $dbase $candidate path]]
                    } eres eopts]} {
                        #best-effort: an unreadable candidate just can't match
                        set csig {}
                    } else {
                        set stillexists 1
                        dict set dbase $candidate sig $csig
                    }
                } else {
                    set csig {}
                }
            } else {
                set csig [dict get $dbase $candidate sig]
            }
            if {$sig eq $csig} {
                set path [dict get $dbase $candidate path]
                #make sure the master file still exists before declaring a
                #duplicate
                if {$stillexists < 0} {
                    set stillexists [file exists $path]
                }
                if {$stillexists == 1} {
                    set duplicate $sig
                    dict set masters $sig id $candidate
                    dict set masters $sig path [dict get $dbase $candidate path]
                    break
                }
            }
        }
        #puts [list of searched count $csearched]
    }
    return $duplicate
}}
#todo: work this in somewhere (for merge)
# set tail [file tail $fname]
# set destpath [file join $dest $tail]
# if {[file exists $destpath]} {
# error msg [list {destpath already exists:} $destpath]
# }
# msg "copying $fname to $destpath"
# file copy $fname $destpath
#documentation metadata for [duplicates], consumed by [checkargs]
#(fixed typo in the format description: "inde" -> "index")
variable doc::duplicates {
    description {
        find duplicated file content
        [$_ scan] should be run, first.
        uses [$_ check] to determine whether a file is a duplicate
    }
    args {
        in {
            description {
                iterator of paths of potential duplicate files
            }
        }
        format {
            description {
                a list specifying what the output format should be
                each item in the list specifies the information to be placed at
                the same index in the return value.
            }
            default {
                lindex path
            }
            validate {[
                all $format in {path sig record}
            ]}
        }
    }
    value {
        an iterator of duplicates
    }
}
#duplicates: a coroutine body that yields each duplicate found among the
#paths produced by the $in iterator, formatted according to $format
#(both presumably bound by [checkargs] from doc::duplicates -- TODO confirm).
#state vars: dbase, masters, of, sha
variable duplicates {args {} {dbase masters of sha} {
    checkargs doc::duplicates
    #initial yield; [lindex [info level 0] 0] is the name this body was
    #invoked as, handed back to the creator
    yield [lindex [info level 0] 0]
    ycl iter for fpath in $in {
        set fpath [::fileutil::fullnormalize $fpath]
        if {![file exists $fpath] || (![file isfile $fpath] && ![
            file isdirectory $fpath])} {
            #broken symbolic link (in the ![file exists] case)
            #or some other special kind of file
            continue
        }
        set duplicate [$_ check $fpath]
        if {$duplicate ne {}} {
            set res {}
            #assemble the requested fields in the order given by $format
            foreach item $format {
                switch $item {
                    path {
                        lappend res $fpath
                    }
                    sig {
                        lappend res $duplicate
                    }
                    record {
                        lappend res [
                            dict get $dbase [dict get $masters $duplicate id]]
                    }
                }
            }
            #a single requested field is yielded bare, not as a one-item list
            if {[llength $res] == 1} {
                set res [lindex $res 0]
            }
            yield $res
        }
    }
}}
#documentation metadata for [scan], consumed by [checkargs]
variable doc::scan {
    description {
        scans the files in the fileset
    }
    args {
    }
}
#scan: stat every file produced by the $of iterator and record it in dbase.
#records are keyed by size,dev,ino,mtime,ctime; each record holds the stat
#fields plus the file's path and an empty signature placeholder
#state vars: dbase (the database dict being built), of (path iterator)
#returns the updated dbase
variable scan {{} {} {dbase of} {
    checkargs doc::scan
    ycl iter for fname in $of {
        #stat directly rather than testing existence first; a failed stat
        #(e.g. a broken symbolic link) simply skips the entry, and this
        #saves an extra system call in the common case
        if {[catch {file stat $fname stat_arr}]} continue
        set finfo [array get stat_arr]
        set key [dict get $finfo size],[dict get $finfo dev],[dict get $finfo ino],[dict get $finfo mtime],[dict get $finfo ctime]
        #stat fields first, then path and the empty sig placeholder
        dict set dbase $key [dict merge $finfo [list path $fname sig {}]]
    }
    return $dbase
}}