#! /usr/bin/env bash
## Public Domain, written by Sebastian Pipping <webmaster@hartwork.org>
## v0.3, 2008-01-08
######################################################

## Safety guard: the placeholder values below have to be replaced by real
## ones first. As long as this line is present the script only prints the
## reminder and exits without doing any work.
echo 'Set ${KUNDE}/${PLZ}/${OUTPUT_DIR} inside the script and delete this line after.'; exit 0
## NOTE(review): presumably customer id and postal code (German "Kunde" and
## "PLZ"); together they form the name of the customer root directory.
KUNDE=xxxxxx
PLZ=xxxxx
## Root of the per-host output tree (<host>/unprocessed, analysis, logs)
OUTPUT_DIR=./xxxxxxxxxxxxxxxxx

KUNDENROOT=/kunden/${KUNDE}_${PLZ}

## Pseudo host name used for log lines that cannot be attributed to a real
## host name (e.g. host field "*" or an IP address)
CATCHALL_HOST=CATCHALL

## Logs are read from INPUT_DIR; a copy in BACKUP_DIR marks a log as done
INPUT_DIR=$KUNDENROOT/logs
BACKUP_DIR=$KUNDENROOT/logs_backup

## Absolute directory of this script. Resolved through cd/pwd so that it is
## correct for relative as well as absolute invocations; simply prefixing
## $PWD would yield a bogus path whenever $0 is already absolute.
SCRIPT_DIR=$(cd -- "$(dirname -- "$0")" && pwd) || {
	echo "ERROR: Could not determine script directory" >&2
	exit 1
}
echo "Script dir = ${SCRIPT_DIR}"

TODAY=$(date +%Y-%m-%d)
echo "Today = $TODAY"
echo

## "mkdir -p" is already silent for existing directories, so any error shown
## here is a real one. Going on without these directories would produce
## truncated per-host logs, therefore stop right away.
for REQUIRED_DIR in "${BACKUP_DIR}" "${OUTPUT_DIR}" ; do
	if ! mkdir -p -- "${REQUIRED_DIR}" ; then
		echo "ERROR: Could not create directory ${REQUIRED_DIR}" >&2
		exit 1
	fi
done

## Main loop. Every file in ${INPUT_DIR} that has no copy in ${BACKUP_DIR}
## yet and that is not modified anymore on the day it is named after gets
##   1. copied to ${BACKUP_DIR} (the copy marks the file as processed),
##   2. split by the first field of each line into one log per host below
##      ${OUTPUT_DIR}/<host>/unprocessed/,
##   3. analyzed host by host with webalizer; each per-host log is moved to
##      ${OUTPUT_DIR}/<host>/logs/ afterwards.
## The set of hosts seen in a file is kept as empty marker files in
## ${BACKUP_DIR}/<file>_hosts/ and removed again when the file is done.

## Unanchored on purpose: any host field containing a dotted quad counts as IP
IP_REGEX='[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+'

for i in "${INPUT_DIR}"/* ; do
	## Without any match the glob stays literal; nothing to process then
	[[ -e "$i" ]] || continue

	SHORTNAME=${i##*/} ## basename $i
	if [[ "$SHORTNAME" == "$KUNDE"-* ]]; then
		echo "SKIP $SHORTNAME // quota user"
		continue
	fi

	if [[ -e "${BACKUP_DIR}/$SHORTNAME" ]]; then
		## Already copied and therefore analyzed as well
		echo "SKIP $SHORTNAME // old"
		continue
	fi

	## The file name has to carry exactly one YYYY-MM-DD date
	DATE=$(printf '%s\n' "$SHORTNAME" | grep -o -E '[0-9]{4}-[0-9]{2}-[0-9]{2}')
	if [[ "$DATE" != ????-??-?? ]]; then
		echo "ERROR: Could not extract date from filename $SHORTNAME"
		continue
	fi

	## Despite the variable name this is the date of the last modification.
	## Asking date(1) directly avoids parsing the column layout of ls.
	ACCESS_DATE=$(date -r "$i" +%Y-%m-%d)
	if [[ "${ACCESS_DATE}" != ????-??-?? ]]; then
		echo "ERROR: Could not get access date of file $SHORTNAME"
		continue
	fi
	if [[ "${ACCESS_DATE}" == "$DATE" ]]; then
		## Modified on its own day: presumably still being written to
		echo "SKIP $SHORTNAME // immature"
		continue
	fi

	echo "COPY $SHORTNAME"
	HOSTS_DIR="${BACKUP_DIR}/${SHORTNAME}_hosts"
	## Start with an empty host set; markers left over from an aborted run
	## would make the split append to logs that were never started
	rm -R -f "${HOSTS_DIR}"
	if ! mkdir -p "${HOSTS_DIR}"; then
		echo "ERROR: Could not create host set for $SHORTNAME"
		continue
	fi
	if ! cp -- "$i" "${BACKUP_DIR}/"; then
		## Drop a partial copy, otherwise the file would count as processed
		echo "ERROR: Could not copy $SHORTNAME"
		rm -R -f "${HOSTS_DIR}"
		rm -f -- "${BACKUP_DIR}/$SHORTNAME"
		continue
	fi

	echo "SPLIT $SHORTNAME"
	SPLIT_LINES=0
	## Lines are read raw (no backslash processing, no trimming) through
	## file descriptor 3, which is scoped to the loop and closed after it.
	## The "|| [[ -n ... ]]" part keeps a last line lacking a newline.
	while IFS= read -r LINE <&3 || [[ -n "$LINE" ]]; do
		## First whitespace separated field, taken without any globbing
		read -r HOST _ <<< "$LINE"

		## "*" (TODO Check where this comes from) and IPs are collected
		## under the catch-all host since we expect subdomains. The same
		## goes for values that cannot serve as a directory name: blank
		## lines and fields containing "/" or being "." or "..".
		if [[ -z "$HOST" || "$HOST" == '*' || "$HOST" == */* \
				|| "$HOST" == . || "$HOST" == .. \
				|| "$HOST" =~ $IP_REGEX ]]; then
			HOST=${CATCHALL_HOST}
		fi

		HOST_LOG="${OUTPUT_DIR}/$HOST/unprocessed/$HOST-$DATE"
		if [[ ! -e "${HOSTS_DIR}/$HOST" ]]; then
			## Add host to set
			touch "${HOSTS_DIR}/$HOST"
			mkdir -p "${OUTPUT_DIR}/$HOST/unprocessed"

			## Truncate/create file
			printf '%s\n' "$LINE" > "$HOST_LOG"
		else
			## Append to file
			printf '%s\n' "$LINE" >> "$HOST_LOG"
		fi
		SPLIT_LINES=$((SPLIT_LINES + 1))
	done 3< "$i"

	## Line stats; the arithmetic expansion strips padding from wc output
	## and turns a failed wc into 0
	TOTAL_LINES=$(( $(wc -l < "$i") ))
	echo "  ${SPLIT_LINES} of ${TOTAL_LINES} lines copied"

	ANALYZED_LINES=0
	for j in "${HOSTS_DIR}/"* ; do
		## Empty host set (e.g. empty log file): the glob stays literal
		[[ -e "$j" ]] || continue

		HOST=${j##*/} ## basename "$j"
		FILE="${OUTPUT_DIR}/$HOST/unprocessed/$HOST-$DATE"
		FLAGS_LINK="${OUTPUT_DIR}/$HOST/analysis/flags"
		mkdir -p "${OUTPUT_DIR}/$HOST/analysis"
		## The link survives between runs, create it only once
		if [[ ! -e "${FLAGS_LINK}" && ! -L "${FLAGS_LINK}" ]]; then
			ln -s "${SCRIPT_DIR}/flags" "${FLAGS_LINK}"
		fi
		LC=$(( $(wc -l < "$FILE") ))
		ANALYZED_LINES=$((ANALYZED_LINES + LC))
		echo "ANALYZE $HOST-$DATE // $LC lines"
		## Uses the webalizer and GeoIP builds shipped next to this script.
		## The options are passed on as before; see the documentation of
		## that webalizer build for their meaning.
		LD_LIBRARY_PATH="${SCRIPT_DIR}/geoip-1.3.17.orig/libGeoIP/.libs/" \
			"${SCRIPT_DIR}/webalizer-2.01-10/webalizer" \
			-p \
			-o "${OUTPUT_DIR}/$HOST/analysis" \
			-W "${SCRIPT_DIR}/GeoIP.dat" \
			-n "$HOST" \
			-Q \
			-R 30 \
			-U 30 \
			-e 30 \
			-E 30 \
			-X \
			"$FILE"
		rm -- "$j"

		## Move to logs folder, cleanup
		mkdir -p "${OUTPUT_DIR}/$HOST/logs"
		mv -- "$FILE" "${OUTPUT_DIR}/$HOST/logs/"
		rmdir "${OUTPUT_DIR}/$HOST/unprocessed"
	done

	## Line stats
	echo "  ${ANALYZED_LINES} of ${SPLIT_LINES} lines analyzed"

	## Clear host set
	rm -R -f "${HOSTS_DIR}"
done
