random bash scripts
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

131 lines
3.2 KiB

  1. #!/bin/bash
  2. echo 'search for multiple files with the same content'
  3. echo
  4. shopt -s extglob
  5. directory=()
  6. sizelimit='0'
  7. algorithm='md5'
  8. missing=''
  9. while (($#)); do
  10. case "$1" in
  11. '-s')
  12. shift
  13. sizelimit="$1"
  14. ;;
  15. '--size='*) sizelimit="${1#*=}" ;;
  16. '-a')
  17. shift
  18. algorithm="$1"
  19. ;;
  20. '--algo='*) algorithm="${1#*=}" ;;
  21. '--help' | '-h' | '-?')
  22. cat <<EOT
  23. Syntax :
  24. duplicate.sh [-s size] [-a algorithm] [directory [...]]
  25. Parameters :
  26. -s size | --size=size - file size in bytes, smallers are not checked ( 0 )
  27. -a algorithm | --algo=algorithm - checksum algorithm : MD5 or SHA1 ( MD5 )
  28. directory - directory with path to include in the search ( . )
  29. All searches are recursive.
  30. EOT
  31. exit
  32. ;;
  33. *)
  34. [[ -e "$1" ]] || missing="$missing '$1',"
  35. directory[${#directory[@]}]="$1"
  36. ;;
  37. esac
  38. shift
  39. done
  40. algorithm="$(tr '[:upper:]' '[:lower:]' <<<"$algorithm")"
  41. summer="${algorithm}sum"
  42. [[ "$missing" ]] && {
  43. echo "error : can not find the following directories :$missing"
  44. exit 1
  45. }
  46. [[ "$sizelimit" != +(0|1|2|3|4|5|6|7|8|9) ]] && {
  47. echo "error : can not use invalid size limit : $sizelimit"
  48. exit 1
  49. }
  50. [[ "$algorithm" != @(md5|sha1) ]] && {
  51. echo "error : can not use unknown algorithm : $algorithm"
  52. exit 1
  53. }
  54. type "$summer" >/dev/null 2>&1 || {
  55. echo "error : can not find the tool for checksum : $summer"
  56. exit 2
  57. }
  58. echo -n 'creating temporary directory... '
  59. work="$(mktemp -d -p '/tmp/' 'dupXXXXXX')"
  60. echo "Ok ( ${work##*/} )"
  61. echo -n 'creating file list... '
  62. find "${directory[@]}" ! -path "$work/*" -type f -printf '%s*\t%p\n' >"$work/1" 2>"$work/1e"
  63. echo "Ok ( $(wc -l <"$work/1"), error $(wc -l <"$work/1e") )"
  64. echo -n 'discarding small files... '
  65. dist=''
  66. if sizelimit; then
  67. awk -vs="$sizelimit" '$1+0<s' "$work/1" >"$work/1s"
  68. dist='s'
  69. echo "Ok ( $(wc -l <"$work/1s") )"
  70. else
  71. echo 'not needed'
  72. fi
  73. echo -n 'searching for duplicated file sizes... '
  74. sort -n "$work/1$dist" | cut -d $'\t' -f 1 | uniq -d >"$work/2"
  75. echo "Ok ( $(wc -l <"$work/2") )"
  76. echo -n 'creating list of potential duplicated files... '
  77. grep -w -F -f "$work/2" "$work/1" | cut -d $'\t' -f 2- >"$work/3"
  78. echo "Ok ( $(wc -l <"$work/3") )"
  79. echo -n 'collecting checksums... '
  80. tr '\n' '\0' <"$work/3" | xargs -0 "$summer" >"$work/4" 2>"$work/4e"
  81. echo "Ok ( $(wc -l <"$work/4"), error $(wc -l <"$work/4e") )"
  82. echo -n 'searching for duplicated checksums... '
  83. sort -n "$work/4" | cut -d ' ' -f 1 | uniq -d >"$work/5"
  84. echo "Ok ( $(wc -l <"$work/5") )"
  85. echo -n 'preparing the checksum list for fast search... '
  86. dist=''
  87. read -r line <"$work/4"
  88. if "$line" == *\**; then
  89. echo 'not needed'
  90. else
  91. sed 's/ / */' "$work/4" >"$work/4s"
  92. dist='s'
  93. echo 'Ok'
  94. fi
  95. echo -n 'creating list of duplicated files... '
  96. sed 's/$/ */' "$work/5" | grep -F -f - "$work/4$dist" | sort >"$work/6"
  97. echo "Ok ( $(wc -l <"$work/6") )"
  98. echo -n 'creating result list... '
  99. awk -F' \\*' -vOFS='' 'l!=$1{print""}{l=$1;$1=""}1' "$work/6" >"duplicated.txt"
  100. echo "Ok ( duplicated.txt )"
  101. echo -n 'cleaning up temporary data... '
  102. size="$(du -h "$work" | cut -d $'\t' -f 1)"
  103. #rm -r -f "$work"
  104. echo "Ok ( $size )"
  105. echo -n 'all done in'
  106. sec=$SECONDS
  107. ((sec/60/60)) && echo -n " $((sec/60/60)) hours"
  108. ((sec/60 % 60)) && echo -n " $((sec/60 % 60)) minutes"
  109. ((sec % 60)) && echo -n " $((sec % 60)) seconds"
  110. ((sec)) || echo -n ' no time'
  111. echo '.'
  112. cat duplicate.txt