From 61a48906f26d7074f7ad64f221d3948cb84f21ea Mon Sep 17 00:00:00 2001 From: Navan Chauhan Date: Sat, 7 Oct 2023 22:46:24 -0600 Subject: py module --- autoAidModules/search_funcs.py | 76 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 autoAidModules/search_funcs.py (limited to 'autoAidModules/search_funcs.py') diff --git a/autoAidModules/search_funcs.py b/autoAidModules/search_funcs.py new file mode 100644 index 0000000..241872e --- /dev/null +++ b/autoAidModules/search_funcs.py @@ -0,0 +1,76 @@ +from serpapi import GoogleSearch +from .sample_res import res +from boilerpy3 import extractors +from fake_useragent import UserAgent + +import requests + +extractor = extractors.ArticleExtractor() + +preferred_forums = { + "BMW": ["bimmerforums.com"] +} + +ua = UserAgent() + +""" +Website data: +[ + +{ + "title":"", + "link": "", + "date": "", # prioritise older posts for older cars?, + "full-text": "", +}, + +] +""" + +def find_preferred_forums(make): + if make not in preferred_forums: + return None + return preferred_forums[make] + +def get_preferred_forums(make): + if make not in preferred_forums: + return find_preferred_forums(make) + return preferred_forums[make] + +def parse_page(url): + content = extractor.get_content_from_url(url) + return content + + +def search_on_forum(forum, query, max_results: int = 5): + params = { + "q": query + f" {forum}", + "location": "Austin, Texas, United States", + "hl": "en", + "gl": "us", + "google_domain": "google.com", + "api_key": "KEY" + } + #search = GoogleSearch(params) + #results = search.get_dict() + + results = res + if results["search_metadata"]['status'] == "Success": + data = [] + for idx, result in enumerate(results["organic_results"]): + if idx >= max_results: + break + new_dict = { + "title": result["title"], + "link": result["link"], + "full-text": "" + } + try: + resp = requests.get(result["link"], headers={"User-Agent": ua.random}) + new_dict["full-text"] = extractor.get_content(resp.text) + except Exception as e: + print(f"Error parsing page {result['link']}: {e}") + data.append(new_dict) + return data + else: + return [] \ No newline at end of file -- cgit v1.2.3