#!/usr/bin/python

# This is just a bunch of ideas tossed together by someone who doesn't know
# anything about NLQs, or even whether "Natural Language Queries" are actually
# referred to as NLQs.  Take it all with a grain of salt.

# I needed something that could respond semi-intelligently to questions posed to
# it.  Quick and Dirty as they say.

# NLQ.py is still rather dumb.  Don't get fancy.  "Would you please explain to 
# me why the sky is blue?" will not get you what you want.  "Why is the sky
# blue?" is much better.
import urllib
import re

import string, sys

# Module version string.
__version__ = "0.1"

# Define the question types.  determine_type() returns one of these integer
# codes; UNKNOWN means the word matched none of the word lists.
(UNKNOWN,
 KNOWLEDGE,
 COMPREHENSION,
 APPLICATION,
 ANALYSIS,
 SYNTHESIS,
 EVALUATION) = range(7)

# Words that select each question type when they appear as the first word of
# a question (see determine_type).  SYNTHESIS and EVALUATION have no trigger
# words yet, so those types are never returned.
KNOWLEDGE_WORDS = [
    "name", "list", "recall", "define", "tell", "match",
    "who", "what", "when", "describe", "where",
]
COMPREHENSION_WORDS = ["retell"]
APPLICATION_WORDS = ["why"]
ANALYSIS_WORDS = ["how", "classify", "outline", "diagram"]
SYNTHESIS_WORDS = []
EVALUATION_WORDS = []

# Stop-word lists.  NLQ lower-cases every token before comparing it against
# these lists, so an entry only takes effect in its lower-case form.  The
# capitalised "I" / "I'll" are kept for anyone reading the lists directly,
# and their lower-case forms are added so they actually filter.
# Exact duplicate entries have been removed; membership is unchanged.
PRONOUNS = [
    "he", "she", "it", "me", "you", "they", "them", "we", "who",
    "myself", "yourself", "ourself",
    "I", "i",
    "my", "his",
]

VERBS = [
    "is", "was", "are", "were", "be", "shall", "am",
    "isn't", "can't", "won't", "shouldn't", "couldn't", "aren't",
    "do", "don't",
]

OTHER_WORDS = [
    "if", "to", "too", "there", "will", "the", "a", "let",
    "I'll", "i'll",
    "this", "these", "those",
    # Wildcard-looking entries: they are compared literally (no glob
    # matching is performed anywhere in this file).
    "*.", "+*", ".*", "<*", ">*", "=*", "*=", "*<", "*>",
    "*-", "-*", "*:", ":*", ";*", "*;", "*,", ",*",
    "*.*", "*,*", "*;*", "*:*", "*+*", "*=*", "*-*", "*_*",
    "*<*", "*>*", "*?*", "*/*",
    "of", "and", "for", "very", "not", "in", "on", "up", "has", "from",
    "which", "or", "by", "can", "that", "your", "with", "their", "over",
    "back", "link", "about", "an", "at", "his", "into", "enter", "so",
    "was", "as",
]

# Everything NLQ refuses to treat as a keyword.  The question-type trigger
# words are included; SYNTHESIS_WORDS and EVALUATION_WORDS are not.
IGNORE_WORDS = (VERBS + PRONOUNS + OTHER_WORDS + KNOWLEDGE_WORDS
                + COMPREHENSION_WORDS + APPLICATION_WORDS + ANALYSIS_WORDS)

def determine_type(word):
    """Return the question-type code for a single word.

    The word is looked up in each trigger-word list in a fixed order and the
    code of the first list containing it is returned; UNKNOWN is returned when
    no list contains it.  Callers currently pass only the first word of the
    question.
    """
    # Order matters: it mirrors the precedence of the lookups and decides the
    # result if a word ever appears in more than one list.
    lookup_order = (
        (KNOWLEDGE_WORDS, KNOWLEDGE),
        (APPLICATION_WORDS, APPLICATION),
        (ANALYSIS_WORDS, ANALYSIS),
        (SYNTHESIS_WORDS, SYNTHESIS),
        (EVALUATION_WORDS, EVALUATION),
        (COMPREHENSION_WORDS, COMPREHENSION),
    )
    for vocabulary, question_type in lookup_order:
        if word in vocabulary:
            return question_type
    return UNKNOWN

class NLQ:
    """Split a question into a question type and a list of keywords.

    Attributes:
        tuple:    the lower-cased, whitespace-separated tokens of the input
                  (a list, despite the name).
        type:     code returned by determine_type() for the first token, or
                  UNKNOWN when the input contains no tokens.
        keywords: the remaining tokens that survive filtering, in input order.
    """

    # A token containing any of these characters is discarded outright.
    # NOTE(review): "[" is listed but "]" is not; kept as in the original.
    _REJECT_CHARS = "~@#$%^&<>:;{}[*()_-+=?"

    def __init__(self, a_string):
        """Tokenise `a_string` and collect its keywords.

        The first token only determines the question type; it is never a
        keyword.  Every later token is kept unless it contains a rejected
        character, does not start with an ASCII letter, or (after trailing
        non-letters are stripped) is listed in IGNORE_WORDS.
        """
        self.tuple = a_string.lower().split()
        # Empty or whitespace-only input has no first word to classify.
        if self.tuple:
            self.type = determine_type(self.tuple[0])
        else:
            self.type = UNKNOWN
        self.keywords = []

        # ascii_letters instead of the locale-dependent string.letters.
        letters = string.ascii_letters
        for word in self.tuple[1:]:
            if any(ch in self._REJECT_CHARS for ch in word):
                continue
            if word[0] not in letters:
                continue
            # Strip trailing punctuation ("blue." -> "blue") and keep the
            # word; previously such words were stripped and then dropped.
            # The first character is a letter, so this cannot empty the word.
            while word[-1] not in letters:
                word = word[:-1]
            # IGNORE_WORDS already contains VERBS, PRONOUNS and OTHER_WORDS,
            # so one membership test covers every stop-word list.
            if word in IGNORE_WORDS:
                continue
            self.keywords.append(word)

    def __repr__(self):
        """Return a two-line summary of the question type and keywords."""
        return "type: %s\nkeywords: %s" % (self.type, self.keywords)
		












class multiChoiceGuesser:
    """Pick the most likely answer to a multiple-choice question.

    Each candidate reply is searched on Google together with the question;
    the reply whose search reports the largest hit count wins.
    """

    def __init__(self, question='', replys=None):
        """Store the question and the candidate replies.

        `replys` defaults to a fresh empty list for every instance (a shared
        mutable default would leak replies between instances).  A list passed
        by the caller is stored as-is, not copied.
        """
        self.question = question
        self.replys = [] if replys is None else replys

    def guessedAnswer(self):
        """Return the index in `self.replys` of the reply with the most hits.

        Ties are resolved in favour of the earliest reply.

        Raises:
            ValueError: if there are no replies to choose from.
        """
        if not self.replys:
            raise ValueError('no replies to choose from')
        hits = [self._getGoogleHits(self.question + ' ' + reply)
                for reply in self.replys]
        return hits.index(max(hits))

    def _getGoogleHits(self, query):
        """Return the hit count Google reports for `query`, or 0 if unknown.

        Network errors from opening or reading the URL propagate to the
        caller; a page that does not contain a parsable hit count yields 0.
        """
        query = urllib.urlencode({'q': query})
        urlHandle = urllib.urlopen('http://www.google.com/search?%s' % query)
        # Always release the connection, including when the read fails.
        try:
            googlePage = urlHandle.read()
        finally:
            urlHandle.close()

        # NOTE(review): the pattern matches the literal text "&lt;b&gt;";
        # confirm this is intended and not an HTML-escaping artifact of "<b>".
        match = re.search(
            'of about &lt;b&gt;(.*?)&lt;/b&gt;.', googlePage, re.S
            )
        if match is None:
            return 0
        try:
            # The count uses thousands separators, e.g. "1,234,567".
            return int(match.group(1).replace(',', ''))
        except ValueError:
            return 0




def _getGooglePage(query):
        """Return the body of the Google search response for `query`.

        The query is URL-encoded as the `q` parameter.  Errors raised while
        opening or reading the URL propagate to the caller.
        """
        query = urllib.urlencode({'q':query})
        urlHandle = urllib.urlopen('http://www.google.com/search?%s' % query)
        # Close the handle even if the read fails; it used to be leaked.
        try:
                return urlHandle.read()
        finally:
                urlHandle.close()




def guess(question, replys):
    mcg = multiChoiceGuesser(question, replys)
    print 'The question is: ', question
    print 'The most likely answer is: ', replys[mcg.guessedAnswer()]
    print ''

question = raw_input ("What is your question?")

source = _getGooglePage(question)

b = NLQ(source)

replys = b.keywords

u = NLQ(question)

bad = u.keywords

for thing in bad:
	replys.remove(thing)

del replys[:13]

del replys[-13:]

print replys

guess(question, replys)