#!/bin/bash
# Author: Theodore Zacharia
# V0.1 - 27/09/2023
# V0.2 - 29/09/2023 - Improve name capture and sorting, prevent shorter names from getting priority
# V0.3 - 31/10/2023 - Add -n feature to look for specific (directory) names
#
# Find files which may belong in directories, based on comparing the filename with a directory of names.
# The existing directories are used as the list of names to look for; if a name is also found in a
# filename, a mv command for that file is written to a script which can be run to move the file into the directory
#
# ToDo: Look into meta data such as EXIF or IPTC tags for details of names
#


# *** Globals
# option state, changed by the command line flags
TRACE="0"               # -t sets this to 1
INCLUDECOMBINED="0"     # -c sets this to 1, -C sets it to 2
SPECIFICNAME=           # -n stores its argument here

# directories and the persistent name list
BASEDIR="."
GNAMES="global_namelist.txt"

# per-run work files in the current directory, made unique by the shell PID
T1="${$}.tmp"
NAMELIST="${$}.namelist.tmp"


# *** Functions
# _usage
# Print the command synopsis, the option table and worked examples to stdout.
# Takes no arguments; the caller decides the exit status and may redirect the
# output.
_usage()
{
	echo "usage: $0 [-h] [-t] [-c|-C] [-n specificname] basedir searchdir"
	echo "where"
	# Quoted delimiter: the text below is printed literally, so the example
	# commands need no escaping and nothing depends on leading tabs.
	cat << '_EOF_'
  -h        Displays this help message
  -t        Set trace on
  -c        To include combined firstnamelastname in search
  -C        To include the excluded endofname chars describing type
  -n        To use a specific name in the search criteria

  basedir   the target directory where files are already stored for the longer term
  searchdir the directory where the files you want to move are currently stored,
            a matching destination dir in the basedir will be looked for

  Examples:

Getting a list, making a specific exclusion then applying the changes

  ./findbyname.sh /media/theodore/sdata23/PnC/Coll /media/theodore/sdata23/PnC >mm 2>mm2

To handle double barrelled or middle names use the -c flag
  ./findbyname.sh -c /media/theodore/sdata23/PnC/Coll /media/theodore/sdata23/PnC >mm 2>mm2

Remove any path you don't want to modify

  grep -v /media/theodore/sdata23/PnC/Coll/S/Short mm > mm1

Run the created file, which is actually a script.  Note, the mv command is in noclobber mode
so if there already is a file with the same name at the destination the file will NOT be moved
  sh mm1

To handle specific names and double barrelled etc
  ./findbyname.sh -n StarWars -c /media/theodore/sdata23/PnC/Coll /media/theodore/sdata23/PnC >mm 2>mm2

For moving from deep to less deep
  ./findbyname.sh /media/theodore/sdata24/PnC/Coll/ /media/theodore/sdata24/PnC/Coll/vn >mm 2>mm2

Note the ending slashes in the above 2 are different, this is important for scope

Creating dir structure, the structure is automatically added to global_namelist.txt
To create a full structure, go to the Coll dir and run
  for A in $(cat global_namelist.txt) ; do mkdir $A ; done

_EOF_
}

# *** Mainline

# process input parameters
#   -t            trace on
#   -c / -C       include combined names (1) / also the type-suffixed files (2)
#   -n name       search for this one name instead of scanning basedir
#   -h            show help and exit successfully
# Anything printed on an error path goes to stderr: stdout is normally
# redirected into the generated mv script and must stay clean.
while getopts thcCn: AOPT
do
	case $AOPT in
	t) TRACE=1 ;; # set TRACE mode
	c) INCLUDECOMBINED=1 ;;
	C) INCLUDECOMBINED=2 ;;
	n) SPECIFICNAME=$OPTARG ;;
	h) _usage
	   exit 0 ;;
	*) # getopts has already reported the offending option on stderr
	   _usage >&2
	   exit 2 ;;
	esac
done

shift $((OPTIND-1))

# both positional arguments are required
if [ $# -lt 2 ]
then
	_usage >&2
	exit 1
fi

BASEDIR=$1
SEARCHDIR=$2

# _strip_leading PREFIX
# Filter: copy stdin to stdout, removing PREFIX from the start of every line
# that begins with it.  PREFIX is compared literally, never as a pattern, so
# a base dir such as "." or one containing ":" is handled correctly.
_strip_leading()
{
	local prefix=$1 line
	while IFS= read -r line
	do
		printf '%s\n' "${line#"$prefix"}"
	done
}

# Build the raw name list in $NAMELIST: either the directory names found
# under the base dir, or just the name given with -n.
if [ -z "$SPECIFICNAME" ]
then
	# getting structure: add the current directory tree (two levels deep) to
	# the persistent list, stored relative to the base dir, sorted and unique
	find "$BASEDIR" -maxdepth 2 -type d >> "$GNAMES"
	_strip_leading "${BASEDIR}/" < "$GNAMES" | sort -u > "$GNAMES.1" && mv "$GNAMES.1" "$GNAMES"

	# get name list

	# nameset 1, must be under Coll
	# -exec passes each path to basename untouched; xargs -I would interpret
	# quotes and backslashes in directory names
	find "$BASEDIR" -maxdepth 2 -type d -exec basename {} \; > "$NAMELIST"
else
	printf '%s\n' "$SPECIFICNAME" > "$NAMELIST"
fi

# now we need to process the namelist to tidy and split names etc

# _strip_type_suffix NAME
# Print NAME with a trailing TS, then CD, then LL type marker removed
# (tried in that order, each at most once).
_strip_type_suffix()
{
	local name=$1
	name=${name%TS}
	name=${name%CD}
	name=${name%LL}
	printf '%s\n' "$name"
}

# _name_variants NAME [COMBINED]
# Print, one per line, the spellings of NAME to search for.
#   NAME      a name that already contains a space (printed unchanged) or a
#             run-together name split on its upper case letters
#   COMBINED  when greater than 0 also print the run-together spelling and,
#             if the first part itself splits in two, the three-part spellings
# Trace details go to stderr when TRACE is greater than 0.
# Returns 1, with a message on stderr, if NAME has no upper case letter to
# split on.
_name_variants()
{
	local name=$1 combined=${2:-0}
	local upper_tail='[A-Z][^A-Z]*$'
	local first_name last_name middle_name="" first_name2

	# Input string containing concatenated first name and last name OR already split
	case $name in
	*\ *)
		printf '%s\n' "$name"
		return 0
		;;
	esac

	# The last name starts at the last uppercase letter
	if [[ $name =~ $upper_tail ]]
	then
		last_name=${BASH_REMATCH[0]}
	else
		echo "$name: Unable to determine last name. Invalid format." >&2
		return 1
	fi

	# Everything before the last name is the first name; capitalize both
	first_name=${name%"$last_name"}
	first_name=${first_name^}
	last_name=${last_name^}

	# some names have middle parts capitalised, so split the first name the same way
	if [[ $first_name =~ $upper_tail ]]
	then
		middle_name=${BASH_REMATCH[0]}
	fi
	first_name2=${first_name%"$middle_name"}

	if [ "${TRACE:-0}" -gt 0 ]
	then
		echo "First Name: $first_name" >&2
		echo "Last Name: $last_name" >&2
		echo "Middle Name: $middle_name" >&2
		echo "First Name2: $first_name2" >&2
		echo "$first_name $last_name" >&2
	fi

	if [ -z "$first_name" ]
	then
		printf '%s\n' "$last_name"
		return 0
	fi

	# put variations of possible names to look for in the namelist
	printf '%s\n' "$first_name $last_name" "${first_name}-$last_name" "${first_name}_$last_name"
	if [ "$combined" -gt 0 ]
	then
		printf '%s\n' "${first_name}$last_name"
		# three-part spellings only make sense when the first name really
		# splits in two; otherwise they would start with a bare separator
		if [ -n "$first_name2" ] && [ -n "$middle_name" ]
		then
			printf '%s\n' \
				"${first_name2} ${middle_name} $last_name" \
				"${first_name2} ${middle_name}$last_name" \
				"${first_name2}-${middle_name}-$last_name" \
				"${first_name2}-${middle_name}$last_name" \
				"${first_name2}_${middle_name}_$last_name" \
				"${first_name2}_${middle_name}$last_name"
		fi
	fi
}

# The whole loop writes to $NAMELIST.1, so the file always exists (possibly
# empty) and never carries lines left over from an earlier run.
while read -r ALINE
do

	# remove special chars which we don't need or want
	BLINE=$(_strip_type_suffix "$ALINE")

	# ignore if less than 3 chars
	if [ ${#BLINE} -lt 3 ]
	then
		if [ "$TRACE" -gt 0 ] ; then echo "skipping $BLINE" >&2 ; fi
		continue
	fi

	if [ "$TRACE" -gt 0 ] ; then echo "processing 1.$ALINE" >&2 ; fi
	if [ "$TRACE" -gt 0 ] ; then echo "processing 2.$BLINE" >&2 ; fi

	# names that cannot be split are reported by the helper and skipped
	_name_variants "$BLINE" "$INCLUDECOMBINED"

done < "$NAMELIST" > "$NAMELIST.1"

mv "$NAMELIST.1" "$NAMELIST"

echo "----------------------------------" >&2
cat $NAMELIST >&2
echo "----------------------------------" >&2

# now we have the names, let's find files matching

# first put longest names at beginning of list as these will be the best match

# sort into largest to smallest
# cat $NAMELIST |  awk '{ print length, $0 }'| sort -r -n -s | cut -d" " -f2- > $NAMELIST.2

# as the above used awk, here is an awk oneliner with no other tools used, not really readable unless you know awk 
awk '{ c = length ; m[c] = m[c] ? m[c] RS $0 : $0 } END { for (c in m) q[++x] = m[c] ; while (x) print q[x--] }' $NAMELIST > $NAMELIST.2

# a perl version
# cat $NAMELIST | perl -e 'print sort { length($b) <=> length($a) } <>'

# and a python version
# cat $NAMELIST | python -c 'import sys; sys.stdout.writelines(sorted(sys.stdin.readlines(), key=len, reverse=True))' 

# The result of doing by longest first is the mv list below will sometimes have duplicates where the first hit gets sent to the longest
# target dir, which is what we want, the shorter will be later and will fail, which is expected.

# NOTE:  Always check the output mv file before you execute it

# _target_for NAME
# Print the directory, relative to the base dir, that NAME maps to: the first
# character of NAME, a slash, then NAME with spaces, underscores and dashes
# removed (e.g. "Alice Smith" -> "A/AliceSmith").
_target_for()
{
	local name=$1
	printf '%s/%s\n' "${name:0:1}" "${name//[ _-]/}"
}

# _dq_escape STRING
# Print STRING with a backslash in front of every character that is special
# inside sh double quotes (backslash, double quote, dollar, backtick), so the
# result can be placed between double quotes in the generated script and is
# read back as exactly STRING.  Ordinary names are printed unchanged.
_dq_escape()
{
	local in=$1 out="" ch i
	for (( i = 0; i < ${#in}; i++ ))
	do
		ch=${in:i:1}
		case $ch in
		'\' | '"' | '$' | '`') out+="\\$ch" ;;
		*) out+=$ch ;;
		esac
	done
	printf '%s' "$out"
}

# For every name (longest first) list the matching files under the search dir
# and print one mv command per file on stdout; progress and misses go to stderr.
# The names are read with default word trimming on purpose, so a variant with
# a stray leading blank is searched without it.
while read -r ALINE
do
	echo "Looking for $ALINE in $SEARCHDIR (not in $BASEDIR)" >&2
	find "$SEARCHDIR" -type f -iname "*${ALINE}*" -not -path "$BASEDIR/*" > "$T1"
	if [ "$INCLUDECOMBINED" -gt 1 ]
	then
		find "$SEARCHDIR" -type f -iname "*${ALINE}*TS" -not -path "$BASEDIR/*" >> "$T1"
		find "$SEARCHDIR" -type f -iname "*${ALINE}*LL" -not -path "$BASEDIR/*" >> "$T1"
	fi

	# target is one without the space (and dash); it depends only on the name,
	# so the destination is worked out once here rather than once per file.
	# The plain directory is preferred, then the TS and LL flavours.
	TARGET=$(_target_for "$ALINE")
	DEST=""
	for SUFFIX in "" TS LL
	do
		if [ -d "$BASEDIR/$TARGET$SUFFIX" ]
		then
			DEST=$(_dq_escape "$BASEDIR/$TARGET$SUFFIX")
			break
		fi
	done

	# file paths are read verbatim (no trimming, no backslash processing)
	while IFS= read -r BLINE
	do
		if [ -n "$DEST" ]
		then
			printf 'mv -n "%s" "%s"\n' "$(_dq_escape "$BLINE")" "$DEST"
		else
			echo "No Match for $BASEDIR/$TARGET" >&2
		fi

	done < "$T1"

done < "$NAMELIST.2"

# keep the work files for inspection when tracing
if [ "$TRACE" -eq 0 ]
then
	rm -f -- "$NAMELIST" "$NAMELIST.2" "$T1"
fi