#!/usr/bin/python
# AskMerlin is a weekend hack I did by putting together a short program called NLQ,
# or natural language query, which can be found online at http://gurno.com/adam/nlq/#download
# NLQ is a class that takes an input query and (1) extracts its keywords and (2) categorizes
# the type of question being asked. I am primarily interested in using the keywords
# extracted from a query by NLQ. I shamelessly modified NLQ to add many more
# IGNORE_WORDS and otherwise spruce it up.
# Next, I utilized the multiChoiceGuesser script that some nice person posted
# on the newsgroup a couple of weeks ago. This uses urllib to go out to the web
# and judge the appropriateness of a given answer by how many hits it gets on Google
# when coupled with the original question in a Google search.
# My main contribution, such as it was, was simply to enable the program to create
# choices in order to choose a most appropriate answer. This is done by using
# NLQ to pick out keywords from the HTML page returned by Google in its search
# on the question. This version, AskMerlin, currently takes a long time
# to come to an answer because it must do many web searches. I also have a simpler version
# called aNoDivisorAskMerlin that comes to a much quicker decision, but is not as good in the
# appropriateness of its answers. In either case, you must wait patiently while
# Merlin considers his options... depending on the speed of your internet connection.
# On my cable modem at home, AskMerlin takes about three minutes.
# I am hoping that someone or other might give me ideas on how to improve the
# intelligence behind Merlin's deliberations. This version is just a hack to see if it works
# in principle. I have other ideas to increase "intelligence" but I need more.
# Currently, Merlin is very low IQ, but he has potential for the future.
# Anyway, Merlin can already answer just about any question.
# Someday, perhaps he will even answer correctly or at least with wisdom
# ;-)))))))))))))


# NLQ:
# This is just a bunch of ideas tossed together by someone who doesn't know
# anything about NLQs or even if these "Natural Language Queries" are referred 
# to as NLQs.  Take it all with a grain of salt. 

# I needed something that could respond semi-intelligently to questions posed to
# it.  Quick and Dirty as they say.

# NLQ.py is still rather dumb.  Don't get fancy.  "Would you please explain to 
# me why the sky is blue?" will not get you what you want.  "Why is the sky
# blue?" is much better.
import urllib
import re

import string, sys

# Module metadata.
__version__ = "0.1"

# Question-type codes returned by determine_type().  UNKNOWN (0) means the
# leading word of the question matched none of the *_WORDS lists.  The other
# six names appear to follow the levels of Bloom's taxonomy.
(UNKNOWN,
 KNOWLEDGE,
 COMPREHENSION,
 APPLICATION,
 ANALYSIS,
 SYNTHESIS,
 EVALUATION) = range(7)

# ---------------------------------------------------------------------------
# Vocabulary tables.
#
# Every entry is compared by plain equality against words that NLQ has
# already lower-cased, so entries MUST be lower case or they can never match.
# Each list holds every word once; order is not significant for matching.
# ---------------------------------------------------------------------------

# Leading words that mark each question type (see determine_type()).
KNOWLEDGE_WORDS = [
    "name",
    "list",
    "recall",
    "define",
    "tell",
    "match",
    "who",
    "what",
    "when",
    "describe",
    "where",
]

COMPREHENSION_WORDS = ["retell"]
APPLICATION_WORDS = ["why"]
ANALYSIS_WORDS = [
    "how",
    "classify",
    "outline",
    "diagram",
]
# No trigger words have been chosen for these two categories yet.
SYNTHESIS_WORDS = []
EVALUATION_WORDS = []

# "i" is deliberately lower case: the original "I" could never equal a
# lower-cased input word, so "i" used to slip through as a keyword.
PRONOUNS = [
    "he",
    "she",
    "it",
    "me",
    "you",
    "they",
    "them",
    "we",
    "who",
    "myself",
    "yourself",
    "ourself",
    "i",
    "my",
]

VERBS = [
    "is",
    "was",
    "are",
    "were",
    "be",
    "shall",
    "am",
    "isn't",
    "can't",
    "won't",
    "shouldn't",
    "couldn't",
    "aren't",
    "do",
    "don't",
]

# Filler words plus a set of punctuation tokens.  The "*" entries are literal
# strings, not wildcard patterns; NLQ also skips any word that contains "*",
# so they only matter to code that consults this list directly.
OTHER_WORDS = [
    "if",
    "to",
    "too",
    "there",
    "will",
    "the",
    "a",
    "let",
    "i'll",  # lower case for the same reason as "i" in PRONOUNS
    "this",
    "these",
    "those",
    "*.",
    "+*",
    ".*",
    "<*",
    ">*",
    "=*",
    "*=",
    "*<",
    "*>",
    "*-",
    "-*",
    "*:",
    ":*",
    ";*",
    "*;",
    "*,",
    ",*",
    "*.*",
    "*,*",
    "*;*",
    "*:*",
    "*+*",
    "*=*",
    "*-*",
    "*_*",
    "*<*",
    "*>*",
    "*?*",
    "*/*",
    "of",
    "and",
    "for",
    "very",
    "not",
    "in",
    "on",
    "up",
    "has",
    "from",
    "which",
    "or",
    "by",
    "can",
    "that",
    "your",
    "with",
    "their",
    "over",
    "back",
    "link",
    "about",
    "an",
    "at",
    "his",
    "enter",
    "into",
    "so",
    "was",
    "as",
]

# Everything that must never be reported as a keyword.  The question-type
# trigger words are included so that "what", "why", "how", ... are dropped
# when they appear after the first position.
IGNORE_WORDS = (VERBS + PRONOUNS + OTHER_WORDS + KNOWLEDGE_WORDS
                + COMPREHENSION_WORDS + APPLICATION_WORDS + ANALYSIS_WORDS)

def determine_type (word):
    """Map a single word to one of the question-type constants.

    For now only one word (the caller passes the first word of the query)
    is examined; matching against the whole string is a planned extension.

    word -- the word to classify; compared by equality against the
            *_WORDS lists.

    Returns the constant of the first list that contains the word, or
    UNKNOWN when no list does.
    """
    # The order of this table is the precedence order: the first list that
    # contains the word decides the result.
    precedence = (
        (KNOWLEDGE_WORDS, KNOWLEDGE),
        (APPLICATION_WORDS, APPLICATION),
        (ANALYSIS_WORDS, ANALYSIS),
        (SYNTHESIS_WORDS, SYNTHESIS),
        (EVALUATION_WORDS, EVALUATION),
        (COMPREHENSION_WORDS, COMPREHENSION),
    )
    for vocabulary, question_type in precedence:
        if word in vocabulary:
            return question_type
    return UNKNOWN

class NLQ:
    """Split a piece of text into a question type and a list of keywords.

    Attributes set by the constructor:
      tuple    -- list of the lower-cased, whitespace-separated words of the
                  input (a list, despite the name).
      type     -- question-type constant for the first word, as returned by
                  determine_type(); UNKNOWN when the input has no words.
      keywords -- the words after the first one that survive filtering, in
                  their original order (duplicates are kept).
    """

    # A word containing any of these characters is discarded outright; this
    # is what throws away markup fragments when the input is an HTML page.
    _REJECT_CHARS = "~@#$%^&<>:;{}[*()_-+=?"

    def __init__(self, a_string):
        """Analyse a_string (a question or any other text)."""
        self.tuple = a_string.lower().split()
        # Empty or whitespace-only input has no first word to classify.
        if self.tuple:
            self.type = determine_type(self.tuple[0])
        else:
            self.type = UNKNOWN
        self.keywords = []

        # The first word only determines the type; it is never a keyword.
        for word in self.tuple[1:]:
            if self._has_reject_char(word):
                continue
            # IGNORE_WORDS already includes VERBS, PRONOUNS and OTHER_WORDS.
            if word in IGNORE_WORDS:
                continue
            if word[0] not in string.ascii_letters:
                continue

            # Trim trailing punctuation/digits ("blue." -> "blue") and keep
            # the word.  Previously a trimmed word was silently dropped
            # because the append sat in the else branch of the trim test.
            # The loop cannot empty the word: its first character is a letter.
            trimmed = word
            while trimmed[-1] not in string.ascii_letters:
                trimmed = trimmed[:-1]
            # Trimming can expose an ignored word, e.g. "the," -> "the".
            if trimmed != word and trimmed in IGNORE_WORDS:
                continue

            self.keywords.append(trimmed)

    def _has_reject_char(self, word):
        """Return True if word contains any character of _REJECT_CHARS."""
        for char in self._REJECT_CHARS:
            if char in word:
                return True
        return False

    def __repr__(self):
        return "type: %s\nkeywords: %s" % (self.type, self.keywords)


class multiChoiceGuesser:
    """Choose the most plausible reply to a question using Google hit counts.

    Each reply is scored as

        hits(question + ' ' + reply) / hits(reply)

    i.e. how often the reply shows up together with the question, relative
    to how common the reply is on its own.

    Attributes:
      question -- the question text.
      replys   -- list of candidate answer strings.
    """

    # Captures the number in text such as "of about <b>1,230,000</b>" or
    # "About 1,230,000 results".
    # NOTE(review): assumes the results page still reports its total in one
    # of these shapes -- confirm against the live markup.
    _HIT_COUNT_RE = re.compile(r'about\s+(?:<b>\s*)?([0-9][0-9,]*)', re.I)

    def __init__(self, question='', replys=None):
        self.question = question
        # A fresh list per instance; a literal [] default would be shared by
        # every instance created without an explicit replys argument.
        if replys is None:
            replys = []
        self.replys = replys

    def guessedAnswer(self):
        """Return the index in self.replys of the best-scoring reply.

        Performs two web searches per reply.  On a tie the earliest reply
        wins.  Raises ValueError when there are no replies.
        """
        if not self.replys:
            raise ValueError('multiChoiceGuesser has no replies to choose from')

        scores = []
        for reply in self.replys:
            together = self._getGoogleHits(self.question + ' ' + reply)
            alone = self._getGoogleHits(reply)
            # Avoid dividing by zero when the reply alone has no hits.
            if alone == 0:
                alone = 1
            # True division: integer division would truncate almost every
            # ratio to 0 and make all replies tie.
            scores.append(float(together) / alone)

        return scores.index(max(scores))

    def _getGoogleHits(self, query):
        """Search Google for query and return the reported number of hits.

        Returns 0 when no hit count can be found in the page.  Errors raised
        while fetching the page are propagated to the caller.
        """
        query = urllib.urlencode({'q': query})
        urlHandle = urllib.urlopen('http://www.google.com/search?%s' % query)
        try:
            googlePage = urlHandle.read()
        finally:
            # Close the connection even if the read fails.
            urlHandle.close()
        return self._parseHitCount(googlePage)

    def _parseHitCount(self, googlePage):
        """Extract the hit count from the text of a results page (0 if absent)."""
        match = self._HIT_COUNT_RE.search(googlePage)
        if match is None:
            return 0
        # The group always starts with a digit, so int() cannot fail once
        # the thousands separators are removed.
        return int(match.group(1).replace(',', ''))




def _getGooglePage(query):
    """Return the raw body of the Google search results page for query.

    query -- the search text; it is URL-encoded here, so pass it unescaped.

    Errors raised while opening or reading the URL are propagated.
    """
    query = urllib.urlencode({'q': query})
    urlHandle = urllib.urlopen('http://www.google.com/search?%s' % query)
    try:
        return urlHandle.read()
    finally:
        # The handle used to be left open; release it whether or not the
        # read succeeds.
        urlHandle.close()


def guess(question, replys):
    """Print the question and the reply that multiChoiceGuesser rates best.

    question -- the question text.
    replys   -- list of candidate answer strings; it is indexed with the
                value returned by guessedAnswer().

    Returns nothing; the result is written to standard output.
    """
    mcg = multiChoiceGuesser(question, replys)
    print 'The question is: ', question
    # guessedAnswer() runs the web searches and returns an index into replys.
    print 'The most likely answer is: ', replys[mcg.guessedAnswer()]
    print ''

question = raw_input ("What is your question?")

source = _getGooglePage(question)

b = NLQ(source)

replys = b.keywords

u = NLQ(question)

bad = u.keywords

for thing in bad:
	replys.remove(thing)

del replys[:13]

del replys[-13:]

print replys

guess(question, replys)