autoAidModules/search_funcs.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101

import os

from serpapi import GoogleSearch
from .sample_res import res
from boilerpy3 import extractors
from fake_useragent import UserAgent
from langchain.llms import Bedrock
from langchain.prompts.prompt import PromptTemplate

import requests

extractor = extractors.ArticleExtractor()

preferred_forums = {
    "BMW": ["bimmerforums.com"],
    "Subaru": ["nasioc.com"]
}

llm = Bedrock(model_id="anthropic.claude-instant-v1")
ua = UserAgent()

"""
Website data: 
[

{
    "title":"",
    "link": "",
    "date": "", # prioritise older posts for older cars?,
    "full-text": "",
},

]
"""

def find_preferred_forums(make):
    if make not in preferred_forums:
        template = "Human:  If BMW: bimmerforums.com, Subaru: nasioc.com, Mazda: forum.miata.net What is the best forum for {make}? No more explanation\n\nAssistant: Then {make}:"
        prompt = PromptTemplate(input_variables=["make"], template=template)
        pred = llm.predict(prompt.format(make=make), max_tokens_to_sample=30, temperature=1,top_k=250, top_p=0.999)
        make_url = pred.strip().split()[0]
        print(f"Found {make_url} for {make}")
        preferred_forums[make] = [make_url]
    return preferred_forums[make]

def get_preferred_forums(make):
    if make not in preferred_forums:
        return find_preferred_forums(make)
    return preferred_forums[make]

def parse_page(url):
    content = extractor.get_content_from_url(url)
    return content

def get_tasks_from_pages(pages: list = [], query: str = "", details: str = ""):
    template =  "Human: You are an beginner mechanic. You are trying to solve the problem of {query} and have a {details}.\n Generate simple tasks from the following pages:\n {pages}\n\nAssistant: I would try all of the following, one by one:\n\n- Have you tried turning your car on and off?\n- "
    prompt_template = PromptTemplate(input_variables=["query", "details", "pages"], template=template)

    
    pred = llm.predict(
        prompt_template.format(
            query=query, details=details, pages=pages
                ), max_tokens_to_sample=501, temperature=1,top_k=250, top_p=0.999
                    )
    pred = "- " + pred
    print(pred)
    return pred


def search_on_forum(forum, query, max_results: int = 5):
    params = {
        "q": query + f" {forum}",
        "location": "Austin, Texas, United States",
        "hl": "en",
        "gl": "us",
        "google_domain": "google.com",
        "api_key": os.environ.get("SERP_API_KEY", "demo")
    }
    #search = GoogleSearch(params)
    #results = search.get_dict()

    results = res # Debugging Data
    if results["search_metadata"]['status'] == "Success":
        data = []
        for idx, result in enumerate(results["organic_results"]):
            if idx >= max_results:
                break
            new_dict = {
                "title": result["title"],
                "link": result["link"],
                "full-text": ""
            }
            try:
                resp = requests.get(result["link"], headers={"User-Agent": ua.random})
                new_dict["full-text"] = extractor.get_content(resp.text)
            except Exception as e:
                print(f"Error parsing page {result['link']}: {e}")
            data.append(new_dict)
        return data
    else:
        return []