#! /usr/bin/env python
#
# treeify -- Turn a list of file names into a tree
#
# Copyright (C) 2011 Sebastian Pipping <sebastian@pipping.org>
# Licensed under GPL v3 or later
#
# EXAMPLE #1
# ==========
# Turn a plain list into a tree.
#
#   $ treeify <<EOF
#   /usr/bin/env
#   /usr/bin/qemu-kvm
#   /bin/bash
#   EOF
#
# OUTPUT:
#   /
#     /bin
#       /bash
#     /usr
#       /bin
#         /env
#         /qemu-kvm
#
#
# EXAMPLE #2
# ==========
# Collect data along with te path.
#
#   $ treeify '^(?P<path>.+) (?P<size_sum>[0-9]+) (?P<type_set>[a-z]+)$' <<EOF
#   /usr/bin/env 22792 cli
#   /usr/bin/okular 57264 gui
#   /bin/bash 844088 cli
#   EOF
#
# OUTPUT:
#   924,144  cli|gui  /            
#   844,088  cli        /bin       
#   844,088  cli          /bash    
#    80,056  cli|gui    /usr       
#    80,056  cli|gui      /bin     
#    22,792  cli            /env   
#    57,264  gui            /okular
#
# Field suffix "_sum" and "_set" tell treeify to sum up integeers or to collect
# values in a set.  Without such suffix a field will only be shown where actually
# set (i.e. for leafs) and not be propagated towards the root.
#
# The output format for this example was derived as '%(size_sum)s  %(type_set)s  %(path)s',
# but we could have specified a custom format as the second command line argument.

from __future__ import print_function
import sys
import os
import re
try:
	import argparse
except ImportError:
	print("ERROR: You need Python 2.7+ unless you have module argparse "
		"(package dev-python/argparse on Gentoo) installed independently.", file=sys.stderr)
	sys.exit(1)

_INDENT = '  '

_DEFAULT_PATTERN = '^(?P<path>.+)$'

_ignore_basenames = (
	'.git',
	'.svn',
	'CVS',
)


def indent(level):
	return _INDENT * level


class Node:
	def __init__(self):
		self.children = dict()
		self.props = dict()

	def _repr_helper(self, path, level=0):
		lines = list()
		lines.append("%s%s %s" % ('  '*level, path, ','.join(str(e) for e in sorted(self.props.items()))))
		for k, v in sorted(self.children.items()):
			child_lines = v._repr_helper("/" + k, level+1)
			lines += child_lines
		return lines

	def __repr__(self):
		lines = self._repr_helper("/")
		return '\n'.join(lines)


def ignore(basename):
	return basename in _ignore_basenames


def colect_props(target, groupdict):
	for k, v in groupdict.items():
		if k == 'path':
			continue
		if v is None:
			continue

		if k not in target:
			target[k] = list()
		target[k].append(v)


def format_groups(integer):
	"""
	Turns an int into a number string with commas,
	e.g. 1234567 becomes "1,234,567".
	"""
	s = str(integer)
	l = len(s)
	a = []
	for i, v in enumerate(s):
		a.append(v)
		if ((l - i - 1) % 3 == 0) and (i != l - 1):
			a.append(',')
	return ''.join(a)


def output_string(value):
	if type(value).__name__ == 'set':
		return '|'.join(sorted(output_string(e) for e in value))
	if type(value).__name__ == 'list':
		return ';'.join(sorted(output_string(e) for e in value))
	elif type(value).__name__ == 'int':
		return format_groups(value)
	elif type(value).__name__ == 'float':
		return '%.3f' % value
	else:
		return str(value)


def entry(path, props, format, max_length, level):
	def align(key, value, max_length):
		if type(value).__name__ in ('int', 'float'):
			sign = '' # align right
		else:
			sign = '-' # align left
		return ('%%%s%ds' % (sign, max_length[key])) % output_string(value)

	# Make final values to fill format string
	d = dict()
	for k, v in props.items():
		if k == 'path':
			continue
		d[k] = align(k, v, max_length)
	d['path'] = align('path', indent(level) + path, max_length)

	# Fill and print format string
	try:
		s = format % d
	except KeyError as e:
		# Fill missing keys with '' and try again
		# Needed to support '[..]' ignore entries
		for x in re.finditer('%\\(([^)]+)\\)', format):
			key = x.group(1)
			if key in d:
				continue
			d[key] = align(key, '', max_length)
		s = format % d
	print(s)


def dump(tree, format, max_length, max_level, level=0):

	def handle_node(tree, label, k, max_level, level):
		entry(label, tree.children[k].props, format, max_length, level)
		if tree.children[k].children:
			if ignore(k) or (level >= max_level):
				entry("[..]", dict(), format, max_length, level + 1)
			else:
				dump(tree.children[k], format, max_length, max_level, level=level + 1)

	if level == 0:
		for key, subst in (("", "/"), ("..", ".."), (".", ".")):
			if key not in tree.children:
				continue
			handle_node(tree, subst, key, max_level, level)
	else:
		for k, v in sorted(tree.children.items()):
			handle_node(tree, '/' + k, k, max_level, level)

_mult_slash_start = re.compile('^//+')

def simplify(path):
	# See bug "os.path.normpath leading '//'"
	# http://bugs.python.org/issue636648
	res = re.sub(_mult_slash_start, "/", os.path.normpath(path))
	if not (res.startswith("/") or res.startswith("../")):
		res = './' + res
	return res


def cast(value, target_type):
	if value is None:
		return None

	if target_type == 'int':
		return int(value)
	elif target_type == 'float':
		return float(value)
	return value


def cast_iterable(iterable, target_type):
	return (cast(e, target_type) for e in iterable)


def cast_and_reduce(path, tree, type_map, level=0):
	"""
	This funciton is used to shovel data up the tree
	The suffix of a key's name determines, what to do with the data:
	 - "_sum" adds up, neutral element zero
	 - "_set" makes unions on sets, neurtal element is the empty set
	"""

	# Cast and reduce ourselves
	for field in tree.props:
		if field.endswith('_sum'):
			tree.props[field] = sum(cast_iterable(tree.props[field], type_map[field]))
		elif field.endswith('_set'):
			tree.props[field] = set(cast_iterable(tree.props[field], type_map[field]))
		else:
			tree.props[field] = list(cast_iterable(tree.props[field], type_map[field]))

	# Recurse, merge in children
	for path, node in tree.children.items():
		cast_and_reduce(path, node, type_map, level=level + 1)

		for field in set(tree.props.keys()).union(node.props.keys()):
			if field.endswith('_sum'):
				tree.props[field] = tree.props.get(field, 0) + node.props.get(field, 0)
			elif field.endswith('_set'):
				tree.props[field] = tree.props.get(field, set()).union(node.props.get(field, set()))


def calculate_field_width(path, tree, max_length, type_map, max_level, level=-1):
	def handle_path_entry(path, hide):
		if hide:
			display = '[..]'
		else:
			display = '/' + path

		cur_max_len = max_length.get('path', 0)
		candidate = len(output_string(indent(level) + display))
		if (candidate > cur_max_len) or (cur_max_len == 0):
			max_length['path'] = candidate


	if level >= max_level + 1:
		for k, v in tree.children.items():
			if v.children:
				handle_path_entry('', hide=True)
		return

	if level >= 0:
		# Calculate length for path
		handle_path_entry(path, hide=ignore(path))

		# Calculate length for non-path
		for k in tree.props:
			v = tree.props[k]
			cur_max_len = max_length.get(k, 0)
			candidate = len(output_string(v))

			if (candidate > cur_max_len) or (cur_max_len == 0):
				max_length[k] = candidate

	# Recurse
	for k, v in tree.children.items():
		calculate_field_width(k, v, max_length, type_map, max_level, level=level + 1)


def command_line():
	parser = argparse.ArgumentParser(description='Converts a list of paths to a tree')
	parser.add_argument(
		'--depth',
		dest='max_depth', metavar='DEPTH', action='store', type=int, default=-1,
		help='Maximum depth to print (starting at 0)')
	parser.add_argument(
		'--no-header',
		dest='print_header', action='store_false', default=True,
		help='Suppress output of table header')
	parser.add_argument(
		'--debug',
		dest='debug', action='store_true', default=False,
		help='Print debugging information')
	parser.add_argument('pattern',
		metavar='PATTERN', nargs='?', action='store', default=_DEFAULT_PATTERN,
		help='Pattern to use for extraction (default: "%(default)s")')
	parser.add_argument('format',
		metavar='FORMAT', nargs='?', action='store', default=None,
		help='(Python-style) format string to use for output (default derived from pattern)')
	return parser.parse_args()


def sanitize_args(args):
	if args.max_depth < 0:
		args.max_depth = sys.maxint

	if args.pattern == _DEFAULT_PATTERN:
		args.print_header = False


if __name__ == '__main__':
	args = command_line()

	sanitize_args(args)

	try:
		extractor = re.compile(args.pattern)
	except re.error as e:
		print("Invalid expression: '%s'" % sys.argv[1], file=sys.stderr)
		sys.exit(1)


	tree = Node()
	type_map = {'path':'str'}

	# Collect and guess field types
	for l in sys.stdin:
		line = l.rstrip()
		x = extractor.match(line)
		if x is None:
			print("Pattern mismatch on line ..", file=sys.stderr)
			print(line, file=sys.stderr)
			sys.exit(1)

		gd = x.groupdict()
		if 'path' not in gd:
			print("Pattern lacks (?P<path>something)", file=sys.stderr)
			sys.exit(1)

		# Guess field type (again and again)
		for k, v in gd.items():
			if k == 'path':
				continue
			if v is None:
				type_map[k] = 'int'  # strictest type of all
				continue
			if (not k in type_map) or (type_map[k] != 'str'):
				_type = 'str'
				try:
					int(v)
					_type = 'int'
				except ValueError as e:
					try:
						float(v)
						_type = 'float'
					except ValueError as e:
						pass
				type_map[k] = _type

		# NOTE: "/foo/" -> ["", "foo"]
		_parts = simplify(gd['path']).split("/")
		cur_tree = tree
		level_count = len(_parts)
		for level, part in enumerate(_parts):
			# Propagate and recurse
			if not part in cur_tree.children:
				cur_tree.children[part] = Node()

			is_last = (level == level_count - 1)
			if is_last:
				colect_props(cur_tree.children[part].props, gd)
			else:
				cur_tree = cur_tree.children[part]

	cast_and_reduce("", tree, type_map)

	# Derive default format to show all fields
	if args.format is None:
		keys = sorted(k for k in type_map.keys() if k != 'path') + ['path', ]
		args.format = '  '.join('%%(%s)s' % k for k in keys)
		if args.debug:
			print("INFO: Using derived output format '%s'" % args.format, file=sys.stderr)

	max_length = dict()

	if args.print_header:
		for k in type_map:
			max_length[k] = len(k)

	calculate_field_width("", tree, max_length, type_map, max_level=args.max_depth)

	if args.print_header:
		header_title = dict()
		for k, v in type_map.items():
			if v in ('int', 'float'):
				sign = '' # align right
			else:
				sign = '-' # align left
			header_title[k] = ('%%%s%ds' % (sign, max_length[k])) % k

		s = args.format % header_title
		print(s)

		space_width = (len(max_length) - 1) * 2
		data_width = sum(v for k, v in max_length.items())
		print('=' * (data_width + space_width))

	dump(tree, args.format, max_length, max_level=args.max_depth)
