diff options
author | Navan Chauhan <navanchauhan@gmail.com> | 2023-10-07 22:47:27 -0600 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-10-07 22:47:27 -0600 |
commit | cc6fc72154f1b2d70cde89381651877526eef78a (patch) | |
tree | 3ff90cfc688eb21ab87c82be9f297b85458305be /autoAidModules/search_funcs.py | |
parent | 87d23597f9008c39e18ae679ca5aa5fbe6174ae9 (diff) | |
parent | 61a48906f26d7074f7ad64f221d3948cb84f21ea (diff) |
Merge pull request #2 from navanchauhan/devel
py module
Diffstat (limited to 'autoAidModules/search_funcs.py')
-rw-r--r-- | autoAidModules/search_funcs.py | 76 |
1 file changed, 76 insertions, 0 deletions
"""Search helpers: find car-repair posts on make-specific forums via SerpAPI."""

from serpapi import GoogleSearch
from .sample_res import res
from boilerpy3 import extractors
from fake_useragent import UserAgent

import requests

# Shared boilerpy3 extractor: pulls the main article text out of raw HTML.
extractor = extractors.ArticleExtractor()

# Vehicle make -> forum domains known to carry useful repair discussions.
preferred_forums = {
    "BMW": ["bimmerforums.com"],
}

# Rotating User-Agent source so page fetches look like ordinary browsers.
ua = UserAgent()

"""
Website data:
[

{
    "title":"",
    "link": "",
    "date": "",  # prioritise older posts for older cars?,
    "full-text": "",
},

]
"""


def find_preferred_forums(make):
    """Return the list of preferred forum domains for *make*, or None if unknown."""
    return preferred_forums.get(make)


def get_preferred_forums(make):
    """Look up the preferred forums for *make* (None when the make is unknown).

    NOTE(review): the original miss-branch called find_preferred_forums(),
    which returned None for the same miss — the fallback was dead logic, so
    this now simply delegates. Behavior for callers is unchanged.
    """
    return find_preferred_forums(make)


def parse_page(url):
    """Fetch *url* and return its extracted main-article text."""
    return extractor.get_content_from_url(url)


def search_on_forum(forum, query, max_results: int = 5):
    """Search Google (via SerpAPI) for *query* restricted to *forum*.

    Returns at most *max_results* dicts of the form
    {"title": ..., "link": ..., "full-text": ...}; "full-text" is left empty
    when the page could not be fetched or parsed. Returns [] when the search
    itself did not succeed.
    """
    params = {
        "q": query + f" {forum}",
        "location": "Austin, Texas, United States",
        "hl": "en",
        "gl": "us",
        "google_domain": "google.com",
        # SECURITY/TODO: placeholder credential — load the real key from an
        # environment variable or config; never commit a live API key.
        "api_key": "KEY",
    }
    # Live search is disabled during development; the canned SerpAPI
    # response `res` (imported above) is substituted below.
    # search = GoogleSearch(params)
    # results = search.get_dict()

    results = res
    if results["search_metadata"]["status"] != "Success":
        return []

    data = []
    for idx, result in enumerate(results["organic_results"]):
        if idx >= max_results:
            break
        new_dict = {
            "title": result["title"],
            "link": result["link"],
            "full-text": "",
        }
        try:
            # Timeout added so a single unresponsive forum cannot hang the
            # whole search loop.
            resp = requests.get(
                result["link"],
                headers={"User-Agent": ua.random},
                timeout=10,
            )
            new_dict["full-text"] = extractor.get_content(resp.text)
        except Exception as e:
            # Best-effort: keep the search hit with empty full-text when the
            # fetch or extraction fails.
            print(f"Error parsing page {result['link']}: {e}")
        data.append(new_dict)
    return data
\ No newline at end of file |