#!/usr/bin/python
#OK, I had to re-write askMerlin from scratch, due to Google's
#disabling of the functionality that the program relied on. It
# seems that merlin was violating Google's terms of service,
#unbeknownst to me, that's for sure! But I do not want to violate
#anyone's terms of service, so I re-wrote askMerlin using
# Yahoo instead. I also came up with a completely new
#web-scraping algorithm, using string functions instead
# of Regular Expressions. Simple is better than complicated!
#
# AskMerlin is a script I did by putting together two scripts,
# modifying them both,
# and adding input/output routines around them.
#
# First, I utilized the multiChoiceGuesser script that Max M posted
# on the newsgroup comp.lang.python a couple weeks ago. This uses urllib
# to go out to
# the web and judge the appropriateness of a given answer by how many
# hits it gets on Google
# when coupled with the original question in a Google search.
# My contributions were to enable the program to ask
# for both an original question, and then for options to choose from. I
#also set up a small
# routine in order to choose a most appropriate answer, in the case that
#no options are given.
# This is done by using the second program, NLQ, to create options of
# its own to choose from:
# NLQ picks out Keywords from the page returned by a Google search of
# the question, by itself.
# Then, these keywords are used as options or possible answers to the
# question.
# Then, multiChoiceGuesser is applied to the question along with all of
# the Keywords
# generated by NLQ. The result can take a long time, but eventually it
# gets there, always. (???)
# Also, I added to multiChoiceGuesser the requirement to do two google
# searches, one on
# the original question and each option, and one on the option by
# itself. Then
# we calculate a ratio between each option's Google hit score and its
# question/option
# Google hit score, thus avoiding merely choosing the option that has
# overwhelmingly high hits
# all by itself.
#
# Surely better algorithms can vastly improve this program!!!
#
# I am hoping someone or some folks come up with improved variations
# and algorithms.
#
# Various algorithms could be tried, and then the results from the
# various algorithms could be
# averaged in order to produce more accurate results.
#
#
# Currently, Merlin may have a low IQ, but he has potential for the
# future.
# Anyway, Merlin can already answer just about any question.
# Someday, perhaps he will even answer correctly, or at least with
# wisdom, most or all
# of the time.
# ;-)))))))))))))
#
#
# NLQ:
# a short program called NLQ,
# or natural language query, which can be found online at
#http://gurno.com/adam/nlq/#download
# NLQ is a Class to take an inputted query and output 1. Keywords and 2.
#also to categorize
# the type of question being asked. I am primarily interested in using
#the Keywords
# extracted from a query by NLQ. I shamelessly modified NLQ to add many
# more
# IGNORE_WORDS and otherwise spruce it up.
#
# NLQ.py is still rather dumb, but hey, he has potential ;-))))).


from urllib import *
import re

import string, sys

# stuff
__version__ = "0.1"

# Integer codes for the question categories returned by determine_type().
# UNKNOWN (0) is the fallback when no category word list matches.
(UNKNOWN,
 KNOWLEDGE,
 COMPREHENSION,
 APPLICATION,
 ANALYSIS,
 SYNTHESIS,
 EVALUATION) = range(7)

# Word lists used to classify a question by its first word and to filter
# uninteresting words out of the keyword list.  Every list is consulted
# with exact, case-sensitive `in` membership tests; callers in this file
# lowercase their input first, so entries should be lowercase.

# First words that mark a question as KNOWLEDGE.
KNOWLEDGE_WORDS = ["name",
                   "list",
                   "recall",
                   "define",
                   "tell",
                   "match",
                   "who",
                   "what",
                   "when",
                   "describe",
                   "where"]

# First words for the remaining categories (two are still empty).
COMPREHENSION_WORDS = ["retell"]
APPLICATION_WORDS = ["why"]
ANALYSIS_WORDS = ["how",
                  "classify",
                  "outline",
                  "diagram"]
SYNTHESIS_WORDS = []
EVALUATION_WORDS = []

# Pronouns never count as keywords.  "I" is kept for callers that test
# raw (non-lowercased) words; "i" is the form lowercased input produces.
PRONOUNS = ["he",
            "she",
            "it",
            "me",
            "you",
            "they",
            "them",
            "we",
            "who",
            "myself",
            "yourself",
            "ourself",
            "I",
            "i",
            "my"]

# Common auxiliary verbs and their contractions.
VERBS = ["is",
         "was",
         "are",
         "were",
         "be",
         "shall",
         "am",
         "isn't",
         "can't",
         "won't",
         "shouldn't",
         "couldn't",
         "aren't",
         "do",
         "don't",
         ]

# Miscellaneous filler words.  Duplicate entries were removed; membership
# results are unchanged by that.
OTHER_WORDS = ["if",
               "to",
               "too",
               "there",
               "will",
               "the",
               "a",
               "let",
               "I'll",
               "i'll",
               "this",
               "these",
               "those",
               # NOTE: the entries below look like wildcard patterns, but
               # nothing in this file expands them -- they only ever match
               # a word that is literally the same string.
               "*.",
               "+*",
               ".*",
               "<*",
               ">*",
               "=*",
               "*=",
               "*<",
               "*>",
               "*-",
               "-*",
               "*:",
               ":*",
               ";*",
               "*;",
               "*,",
               ",*",
               "*.*",
               "*,*",
               "*;*",
               "*:*",
               "*+*",
               "*=*",
               "*-*",
               "*_*",
               "*<*",
               "*>*",
               "*?*",
               "*/*",
               "of",
               "and",
               "for",
               "very",
               "not",
               "in",
               "on",
               "up",
               "has",
               "from",
               "which",
               "or",
               "by",
               "can",
               "that",
               "your",
               "with",
               "their",
               "over",
               "back",
               "link",
               "about",
               "an",
               "at",
               "his",
               "enter",
               "into",
               "so",
               "was",
               "as",
               "but"]

# Everything that must never be reported as a keyword.  SYNTHESIS_WORDS and
# EVALUATION_WORDS are not part of the concatenation (both are empty).
IGNORE_WORDS = VERBS + PRONOUNS + OTHER_WORDS + KNOWLEDGE_WORDS + COMPREHENSION_WORDS + APPLICATION_WORDS + ANALYSIS_WORDS

def determine_type (word):
    """Return the question-type code whose word list contains `word`.

    Only a single word is examined (callers pass the first word of the
    question).  The lists are tried in a fixed order and the first match
    wins; UNKNOWN is returned when no list contains the word.
    """
    # Order matters only if a word ever appears in more than one list.
    lookup_order = (
        (KNOWLEDGE_WORDS, KNOWLEDGE),
        (APPLICATION_WORDS, APPLICATION),
        (ANALYSIS_WORDS, ANALYSIS),
        (SYNTHESIS_WORDS, SYNTHESIS),
        (EVALUATION_WORDS, EVALUATION),
        (COMPREHENSION_WORDS, COMPREHENSION),
    )
    for vocabulary, question_type in lookup_order:
        if word in vocabulary:
            return question_type
    return UNKNOWN

class NLQ:
    """Split a text into a question type and a list of keywords.

    Attributes set by the constructor:
      tuple    -- list of the lowercased, whitespace-separated words
      type     -- result of determine_type() on the first word
      keywords -- words after the first one that survive filtering

    A word is dropped when it contains one of the characters in
    _REJECT_CHARS, when it does not start with a letter, or when it is
    listed in IGNORE_WORDS.  Trailing non-letter characters (for example a
    sentence-ending "." or ",") are stripped before the IGNORE_WORDS test.
    """

    # Any word containing one of these characters is discarded outright.
    # ("]" is intentionally absent: it was not rejected originally either.)
    _REJECT_CHARS = "~@#$%^&<>:;{}[*()_-+=?"

    def __init__(self, a_string):
        """Analyse `a_string`; an empty string yields no keywords."""
        self.tuple = a_string.lower().split()

        # Guard against empty input: an empty word matches no word list,
        # so determine_type() falls through to its "unknown" result.
        if self.tuple:
            first_word = self.tuple[0]
        else:
            first_word = ""
        self.type = determine_type (first_word)
        self.keywords = []

        # The first word only decides the question type; it is never a
        # keyword itself.
        for word in self.tuple[1:]:
            if self._has_reject_char(word):
                continue
            if not word[0].isalpha():
                continue

            # Strip trailing punctuation/digits.  Previously such words
            # were trimmed and then silently discarded because the append
            # lived in the `else` branch of the trimming test.  The loop
            # cannot empty the word: its first character is a letter.
            while not word[-1].isalpha():
                word = word[:-1]

            # IGNORE_WORDS already includes VERBS, PRONOUNS and
            # OTHER_WORDS, so one membership test is sufficient.
            if word in IGNORE_WORDS:
                continue

            self.keywords.append (word)

    def _has_reject_char(self, word):
        """Return true if `word` contains any character in _REJECT_CHARS."""
        for char in self._REJECT_CHARS:
            if char in word:
                return 1
        return 0

    def __repr__(self):
        return "type: %s\nkeywords: %s" % (self.type, self.keywords)


class multiChoiceGuesser:
    """Pick the most plausible reply to a question from search hit counts.

    For every reply two searches are made: one for the question together
    with the reply and one for the reply on its own.  The reply with the
    smallest ratio (reply-alone hits / question-plus-reply hits) wins.

    Despite the "Google" in the method names, the searches are sent to
    search.yahoo.com.
    """

    def __init__(self, question='', replys=()):
        """Remember the question text and the sequence of candidate replys."""
        self.question = question
        self.replys   = replys

    def guessedAnswer(self):
        """Return the index in `replys` of the best-scoring reply.

        Raises ValueError when there are no replys to choose from, or when
        a hit count cannot be read from a result page.
        """
        if not self.replys:
            raise ValueError("no replys to choose from")

        ratios = []
        for reply in self.replys:
            combined = float(self._getGoogleHits(self.question + ' ' + reply))
            alone = float(self._getGoogleHits(reply))

            # Avoid dividing by zero when the combined search found nothing.
            if combined == 0:
                combined = 1.0

            ratios.append(alone / combined)

        # Lowest ratio wins; ties go to the earliest reply.
        return ratios.index(min(ratios))

    def _getGoogleHits(self, query):
        """Search for `query` and return the reported number of hits.

        Raises ValueError when the result page carries no hit count.
        """
        query = urlencode({'p':query})
        urlHandle = urlopen('http://search.yahoo.com/bin/search?%s' % (query))
        try:
            page = urlHandle.read()
        finally:
            # Close the connection even when reading fails.
            urlHandle.close()
        return self._parseHitCount(page)

    def _parseHitCount(self, page):
        """Extract the integer following "out of about" from `page`.

        Thousands separators (commas) are removed.  Raises ValueError when
        the phrase, or a number after it, is missing.
        """
        match = re.search(r'out of about\s*(\d[\d,]*)', page)
        if match is None:
            raise ValueError("hit count not found in search results page")
        return int(match.group(1).replace(',', ''))

def guess(question, choices):
    mcg = multiChoiceGuesser(question, choices)
    print ' The question is: ', question
    print " Please wait for Merlin's  answer: ", choices[mcg.guessedAnswer()]
    print ''

def get_list(heading, prompt):

        print heading
        print
        print "(enter a blank line to end the list)"
        ret = []
        i = 1
        while 1:
                line = raw_input(prompt % i)
                if not line:
                        break
                ret.append(line)
                i=i+1
        print
        return ret


question = raw_input ("What is your question?")


choices = get_list("Enter your options:", "Option %d: ")

if choices == []:

        print """Since you did not give Merlin any options, it may take a
while as he thinks. Please be patient; if you do not touch your keyboard
or mouse for a few minutes, Merlin will respond ;-)))))"""
 
        urlHandle = urlopen('http://search.yahoo.com/bin/search?%s' % (question))       
        source = urlHandle.read()

        b = NLQ(source)

        choices = b.keywords

        u = NLQ(question)

        bad = u.keywords

        
        del choices[:1]

        del choices[-1:]
     

guess(question, choices)

while 1:


        question = raw_input ("what is your next question?")


        choices = get_list("Enter your options:", "Option %d: ")

        if not choices:

                print """Since you did not give Merlin any options, it may
take a while as he thinks. Please be patient and if you do not touch
your keyboard or mouse for a few minutes, Merlin will respond."""


                urlHandle = urlopen('http://search.yahoo.com/bin/search?%s' % (question))
                source = urlHandle.read()
                

                b = NLQ(source)

                choices = b.keywords

                u = NLQ(question)

                bad = u.keywords


                del choices[:1]

                del choices[-1:]
             
                
        guess(question, choices)