import sys, random

# Beardy: Uses Markov chains to generate random sentences given a particular
# piece of test data. Fun stuff.

def weightedPick(d):
	"""
	Pick a random item from a dictionary containing item : weight pairs, assumming
	the weights are normalized. Raises an exception if no element could be
	piecked.
	"""
	rand = random.random()
	s = 0
	for key, weight in d.iteritems():
		s += weight
		if rand <= s:
			return key
	raise Exception("No element picked")

class Word(object):
	"""
	A chain of words which also contains references to word chains which follow
	it along with the number of occurrences of each. It is also able to output a
	normalized version.
	"""
	def __init__(self, word):
		# The word-chain this object represents
		self.word = word
		# The dictionary containing Word : frequency for all words this is related
		# to.
		self.rawRelations = {}
		
		# A cached dictionary which is a normalized version of the above
		self._relations = {}
	
	def __hash__(self):
		"""
		Hash this word chain -- note that the relations are irelevent
		"""
		return self.word.__hash__()
	
	def followedBy(self, otherWord):
		"""
		Count the fact that this worchain is followed by the passed wordchain.
		"""
		if self.rawRelations.has_key(otherWord):
			self.rawRelations[otherWord] += 1
		else:
			self.rawRelations[otherWord] = 1
	
	@property
	def relations(self):
		"""
		A normalized version of the dictionary of the wordschains which can follow
		this one.
		"""
		if len(self._relations) != len(self.rawRelations):
			normalizedRelations = {}
			
			sumOfFrequencies = float(sum(self.rawRelations.itervalues()))
			for word, frequency in self.rawRelations.iteritems():
				normalizedRelations[word] = float(frequency) / sumOfFrequencies
			self._relations = normalizedRelations
		return self._relations
	
	def __str__(self):
		return str(self.word)
	__repr__=__str__

class Beard(object):
	"""
	A markov process.
	"""
	def __init__(self, order = 2):
		# A list of words found by the system
		self.words = {}
		
		# The oder of the process to use. Must be 1 or greater
		self.order = order
	
	def getWord(self, word):
		"""
		Return a refrence to a Word object which represents the provided wordchain.
		If one does not exist, create it.
		"""
		if not self.words.has_key(word):
			self.words[word] = Word(word)
		return self.words[word]
	
	def train(self, trainingData):
		"""
		Generate markov chains for a given string using each sentence as an
		individual piece of training data.
		"""
		for sentence in trainingData.strip().split("."):
			# If this sentence isn't empty
			words = sentence.strip().split()
			if len(words) != 0:
				# Append an ending word of "." which is used to signal the end of a
				# chain
				words.append(".")
				
				# Strip out any empty words (e.g. double spaces etc.)
				words = filter((lambda x : x != ""), words)
				
				# Generate a moving grouping of words apropriate for the current order
				# size. E.g. a b c d becomes ab bc cd for order 2.
				wordGroups = [
					tuple([ word for word in words[offset : offset + self.order] ])
					for offset in range (len(words) - self.order + 1)
				]
				
				# The "start" of any chain is the empty word
				lastWord = self.getWord(tuple([""]))
				
				# Loop through all the words, each time adding it to the previous word's
				# refrence list.
				for wordGroup in wordGroups:
					thisWord = self.getWord(wordGroup)
					lastWord.followedBy(thisWord)
					lastWord = thisWord
	
	@property
	def digraph(self):
		"""
		The digraph representing this chain in GraphViz format.
		"""
		digraph = ""
		digraph = "digraph beard {\n"
		words = self.words
		for word in words:
			relations = words[word].relations
			for relation in relations:
				digraph += "\"%s\" -> \"%s\" [label=\"%0.3f\"];\n"%(
					",".join(word),
					",".join(relation.word),
					relations[relation]
				)
		digraph += "}"
		return digraph
	
	def generateSentence(self):
		"""
		Generate a random sentence using this chain.
		"""
		output = ""
		word = self.getWord(tuple([""]))
		endWord = self.getWord(tuple(["."]))
		while word.__hash__() != endWord.__hash__():
			if len(word.relations) != 0:
				output += word.word[0] + " "
				word = weightedPick(word.relations)
			else:
				output += " ".join(word.word)
				break
		return output


if __name__=="__main__":
	if len(sys.argv) == 4:
		inFile = open(sys.argv[1], "r")
		trainingData = inFile.read()
		
		beard = Beard(int(sys.argv[2]))
		beard.train(trainingData)
		
		if sys.argv[3] == "digraph":
			print beard.digraph
		else:
			while True:
				print beard.generateSentence()
				raw_input()
	else:
		print "usage: trainingfile order (digraph|generate)"