#!/usr/bin/env tclsh
#documentation metadata for [load], consumed by [checkargs]
#(the braced value is data; comments cannot go inside it)
variable doc::load {
    args {
        chan {
            description {
                a channel opened for reading. Data from the
                scanned files will be stored/updated in it
            }
        }
    }
}
#load: rebuild the file database dict from a serialized matrix read from $chan.
#shared vars: dupnames (ordered field names bound to each matrix row),
#matrixcount (counter used to generate unique matrix command names)
#state var: dbase (the database dict being rebuilt)
#returns the populated dbase, keyed by size,dev,ino,mtime,ctime
variable load {{args} {dupnames matrixcount} {dbase} {
    checkargs doc::load
    #each call gets its own uniquely-named scratch matrix command
    set id [incr matrixcount]
    if {$chan eq {}} {
        return {}
    }
    #rewind and slurp the whole channel
    seek $chan 0
    set dbase [string trim [read $chan]]
    if {$dbase eq {}} {
        #no saved data: start from a fresh, empty matrix
        #NOTE(review): 14 columns here vs 15 in [save] -- confirm which is right
        matrix matrix$id
        matrix$id add columns 14
    } else {
        #puts [list deserialize database]
        matrix matrix$id
        #the trailing [set dbase {}] releases dbase's reference so the
        #string passed to deserialize is unshared
        matrix$id deserialize $dbase[set dbase {}]
        #puts [list deserialize done]
    }
    set rows [matrix$id rows]
    #looping backwards is much faster when deleting matrix rows, because matrix
    #delete internally uses lreplace, which causes all later items to slide
    #forward in the list
    for {set i [expr {$rows-1}]} {$i >= 0} {incr i -1} {
        set row [matrix$id get row $i]
        #this doesn't save any space, because internally, Tcl shares the
        #objects anyway
        #matrix$id delete row $i
        #bind each column value to its field name
        lassign $row {*}$dupnames
        #identity key; presumably dupnames includes size dev ino mtime ctime
        #-- TODO confirm against the caller that sets dupnames
        set key $size,$dev,$ino,$mtime,$ctime
        foreach name $dupnames {
            dict set dbase $key $name [set $name]
        }
    }
    #puts [list dupopen done]
    matrix$id destroy
    return $dbase
}}
#documentation metadata for [save] (no documented args yet)
variable doc::save {}
#save: serialize $dbase to $chan via a scratch matrix.
#arg: chan -- a writable channel
#shared var: matrixcount (unique matrix name counter); state var: dbase
#side effect: dbase is emptied as its records are written out
variable save {{chan} {matrixcount} {dbase} {
    set id [incr matrixcount]
    matrix matrix$id
    #NOTE(review): 15 columns here vs 14 in [load] -- confirm which is correct
    matrix$id add columns 15
    dict for {key val} $dbase {
        matrix$id add row {*}[dict values $val]
        #release each record as it is written, so dbase ends up empty
        dict unset dbase $key
    }
    puts $chan [matrix$id serialize]
    matrix$id destroy
}}
#documentation metadata for [check], consumed by [checkargs]
#NOTE(review): the empty {} after "description" does not appear in the other
#doc:: entries in this file -- confirm whether it is intentional
variable doc::check {
    description {} {
        check to see whether $fpath is a duplicate of any file in the fileset
    }
    value {
        either the signature of the duplicated file or {}, if $fpath is not a
        duplicate
    }
    args {
        dbasename {
            positional
            dataset to check against
        }
        fpath {
            positional
            path of candidate duplicate
        }
    }
}
#check: determine whether $fpath duplicates a file already in the fileset.
#arg: fpath -- path of the candidate duplicate (caller should pass only
#existing files)
#state vars: dbase (scan records keyed by size,dev,ino,mtime,ctime),
#masters (sig -> id and path of the canonical copy of each signature),
#inseen (identity key -> sig cache for candidates hashed during this run)
#returns the signature of the duplicated file, or {} if $fpath is not a
#duplicate
variable check {{fpath} {} {dbase masters inseen} {
    file stat $fpath fstats_array
    set fstats [array get fstats_array]
    #bring size/dev/ino/mtime/ctime into local scope
    dict with fstats {}
    set duplicate {}
    #todo: make sure this mtime ctime shortcut is bulletproof
    #it should be, since it's only used for the current scan of the candidate
    #duplicates
    #this key corresponds with the key in [dupscan]
    set key $size,$dev,$ino,$mtime,$ctime
    if {[dict exists $inseen $key]} {
        #already hashed this identity during this run
        set sig [dict get $inseen $key sig]
    } else {
        #caller should only provide existing files
        #it is an error for file not to exist
        if {[catch {set sig [$_ sha digest ${fpath}]} eres eopts]} {
            if {[dict get $eopts -errorcode] eq {NONE}} {
                #generic errors are unexpected: propagate them
                return -options $eopts $eres
            }
            #errors carrying a specific errorcode (e.g. POSIX) are treated
            #as "not a duplicate"
            return {}
        } else {
            dict set inseen $key sig $sig
        }
    }
    #puts [list in signature $fpath $sig]
    if {[dict exists $masters $sig]} {
        set mkey [dict get $masters $sig id]
        if {$mkey eq $key} {
            #this is the master. Don't delete it
            return {}
        }
        #assume the master still exists. Rationale: it was put in the list
        #of masters during this run
        set duplicate $sig
    }
    if {$duplicate eq {}} {
        set candidates [dict keys $dbase $size,*]
        set csearched 0
        foreach candidate $candidates {
            #whether the current candidate's file still exists; -1 means
            #unknown.  Reset per candidate: previously this was set once
            #before the loop, so one candidate's existence result leaked
            #into the checks for every later candidate (a stale 0 would
            #suppress all later matches; a stale 1 would skip the existence
            #check for a possibly-deleted file)
            set stillexists -1
            incr csearched
            if {$candidate eq $key} {
                if {![dict exists $masters $sig]} {
                    #first sighting of this identity: record it as the master
                    dict set masters $sig id $candidate
                    dict set masters $sig path [dict get $dbase $candidate path]
                    #this is the master, not a duplicate
                    break
                }
                #it's the same identity as the potential duplicate.
                #don't compare it against itself
                continue
            }
            if {[dict get $dbase $candidate sig] eq {}} {
                #signature not cached yet; compute and cache it
                #puts [list generate of signature [
                #	dict get $dbase $candidate path]]
                set path [dict get $dbase $candidate path]
                if {[file exists $path]} {
                    if {[catch {
                        set csig [$_ sha digest [dict get $dbase $candidate path]]
                    } eres eopts]} {
                        #best-effort: an unreadable candidate just can't match
                        set csig {}
                    } else {
                        set stillexists 1
                        dict set dbase $candidate sig $csig
                    }
                } else {
                    set csig {}
                }
            } else {
                set csig [dict get $dbase $candidate sig]
            }
            if {$sig eq $csig} {
                set path [dict get $dbase $candidate path]
                #make sure the master file still exists before declaring a
                #duplicate
                if {$stillexists < 0} {
                    set stillexists [file exists $path]
                }
                if {$stillexists == 1} {
                    set duplicate $sig
                    dict set masters $sig id $candidate
                    dict set masters $sig path [dict get $dbase $candidate path]
                    break
                }
            }
        }
        #puts [list of searched count $csearched]
    }
    return $duplicate
}}
#todo: work this in somewhere (for merge)
# set tail [file tail $fname]
# set destpath [file join $dest $tail]
# if {[file exists $destpath]} {
# error msg [list {destpath already exists:} $destpath]
# }
# msg "copying $fname to $destpath"
# file copy $fname $destpath
#documentation metadata for [duplicates], consumed by [checkargs]
#(fixed typo in the format description: "inde" -> "index")
variable doc::duplicates {
    description {
        find duplicated file content
        [$_ scan] should be run, first.
        uses [$_ check] to determine whether a file is a duplicate
    }
    args {
        in {
            description {
                iterator of paths of potential duplicate files
            }
        }
        format {
            description {
                a list specifying what the output format should be
                each item in the list specifies the information to be placed at
                the same index in the return value.
            }
            default {
                lindex path
            }
            validate {[
                all $format in {path sig record}
            ]}
        }
    }
    value {
        an iterator of duplicates
    }
}
#duplicates: a coroutine body that yields each duplicate found among the
#paths produced by the $in iterator, formatted according to $format
#(both presumably bound by [checkargs] from doc::duplicates -- TODO confirm).
#state vars: dbase, masters, of, sha
variable duplicates {args {} {dbase masters of sha} {
    checkargs doc::duplicates
    #initial yield; [lindex [info level 0] 0] is the name this body was
    #invoked as, handed back to the creator
    yield [lindex [info level 0] 0]
    ycl iter for fpath in $in {
        set fpath [::fileutil::fullnormalize $fpath]
        if {![file exists $fpath] || (![file isfile $fpath] && ![
            file isdirectory $fpath])} {
            #broken symbolic link (in the ![file exists] case)
            #or some other special kind of file
            continue
        }
        set duplicate [$_ check $fpath]
        if {$duplicate ne {}} {
            set res {}
            #assemble the requested fields in the order given by $format
            foreach item $format {
                switch $item {
                    path {
                        lappend res $fpath
                    }
                    sig {
                        lappend res $duplicate
                    }
                    record {
                        lappend res [
                            dict get $dbase [dict get $masters $duplicate id]]
                    }
                }
            }
            #a single requested field is yielded bare, not as a one-item list
            if {[llength $res] == 1} {
                set res [lindex $res 0]
            }
            yield $res
        }
    }
}}
#documentation metadata for [scan], consumed by [checkargs]
variable doc::scan {
    description {
        scans the files in the fileset
    }
    args {
    }
}
#scan: stat every file produced by the $of iterator and record it in dbase.
#records are keyed by size,dev,ino,mtime,ctime; each record holds the stat
#fields plus the file's path and an empty signature placeholder
#state vars: dbase (the database dict being built), of (path iterator)
#returns the updated dbase
variable scan {{} {} {dbase of} {
    checkargs doc::scan
    ycl iter for fname in $of {
        #stat directly rather than testing existence first; a failed stat
        #(e.g. a broken symbolic link) simply skips the entry, and this
        #saves an extra system call in the common case
        if {[catch {file stat $fname stat_arr}]} continue
        set finfo [array get stat_arr]
        set key [dict get $finfo size],[dict get $finfo dev],[dict get $finfo ino],[dict get $finfo mtime],[dict get $finfo ctime]
        #stat fields first, then path and the empty sig placeholder
        dict set dbase $key [dict merge $finfo [list path $fname sig {}]]
    }
    return $dbase
}}