import requests
from boilerpy3 import extractors
from fake_useragent import UserAgent
from serpapi import GoogleSearch

from .sample_res import res

# Extractor that keeps the main article/post text and strips boilerplate.
extractor = extractors.ArticleExtractor()

# Curated forums per vehicle make; extend as more makes are supported.
preferred_forums = {
    "BMW": ["bimmerforums.com"]
}
ua = UserAgent()
"""
Website data:
[
{
"title":"",
"link": "",
"date": "", # prioritise older posts for older cars?,
"full-text": "",
},
]
"""


def find_preferred_forums(make):
    # Look up curated forums for a make; None means we know of no forum yet.
    if make not in preferred_forums:
        return None
    return preferred_forums[make]


def get_preferred_forums(make):
    # Return the curated list, falling back to find_preferred_forums for
    # makes without a curated entry.
    if make not in preferred_forums:
        return find_preferred_forums(make)
    return preferred_forums[make]


def parse_page(url):
    # Fetch the URL and return its main text content via boilerpy3.
    content = extractor.get_content_from_url(url)
    return content


def search_on_forum(forum, query, max_results: int = 5):
    # Build a Google search via SerpAPI, scoped to the forum by appending the
    # forum domain to the query.
    params = {
        "q": query + f" {forum}",
        "location": "Austin, Texas, United States",
        "hl": "en",
        "gl": "us",
        "google_domain": "google.com",
        "api_key": "KEY"
    }
    # Live search is disabled for now; the canned sample response is used instead.
    # search = GoogleSearch(params)
    # results = search.get_dict()
    results = res

    if results["search_metadata"]["status"] == "Success":
        data = []
        for idx, result in enumerate(results["organic_results"]):
            if idx >= max_results:
                break
            new_dict = {
                "title": result["title"],
                "link": result["link"],
                "full-text": ""
            }
            try:
                # Fetch the result page with a random User-Agent and extract its text.
                resp = requests.get(
                    result["link"],
                    headers={"User-Agent": ua.random},
                    timeout=10,
                )
                new_dict["full-text"] = extractor.get_content(resp.text)
            except Exception as e:
                print(f"Error parsing page {result['link']}: {e}")
            data.append(new_dict)
        return data
    else:
        return []
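

# A minimal usage sketch, assuming the canned sample response above; the make
# and query below are illustrative only, not part of the module.
if __name__ == "__main__":
    forums = get_preferred_forums("BMW") or []
    for forum in forums:
        for post in search_on_forum(forum, "BMW E46 rough idle", max_results=3):
            print(post["title"], "-", post["link"])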