1
Fork 1
mirror of https://git.lolcat.ca/lolcat/4get.git synced 2024-11-08 17:43:07 -05:00

Compare commits

..

7 commits

Author SHA1 Message Date
lolcat
b98a393421 Merge pull request 'Clean .gitignore and ignore icons/*' (#25) from libsys/4get:master into master
Reviewed-on: https://git.lolcat.ca/lolcat/4get/pulls/25
2024-06-27 02:59:08 +00:00
Felix Freeman
cb4d933467 Ignore files in icons/ 2024-06-26 19:09:58 -04:00
Felix Freeman
d509effb30 Clean .gitignore 2024-06-26 09:29:46 -04:00
lolcat
77931f3ee9 brave error handling 2024-06-25 18:05:43 -04:00
lolcat
640d1d1953 fixed nasty brave bug 2024-06-24 17:15:54 -04:00
lolcat
75003b6617 Merge branch 'master' of https://git.lolcat.ca/lolcat/4get 2024-06-22 14:00:12 -04:00
lolcat
4e1df70ce6 getTextContent google error piece of shit 2024-06-22 13:59:10 -04:00
6 changed files with 255 additions and 47 deletions

6
.gitignore vendored
View file

@ -1,4 +1,3 @@
<<<<<<< HEAD
lib/test.html lib/test.html
lib/postdata.json lib/postdata.json
lib/nextpage.json lib/nextpage.json
@ -25,8 +24,5 @@ data/captcha/birds/
data/captcha/fumo_plushies/ data/captcha/fumo_plushies/
data/captcha/minecraft/ data/captcha/minecraft/
!banner/*default* !banner/*default*
=======
banner/*
!banner/*default*
>>>>>>> 77293818cd213ec0ad07c573d298fff9cd5b357d
scraper/curlie.html scraper/curlie.html
icons/*

View file

@ -75,6 +75,7 @@ class backend{
break; break;
case "socks5_hostname": case "socks5_hostname":
case "socks5a":
curl_setopt($curlproc, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5_HOSTNAME); curl_setopt($curlproc, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5_HOSTNAME);
curl_setopt($curlproc, CURLOPT_PROXY, $address . ":" . $port); curl_setopt($curlproc, CURLOPT_PROXY, $address . ":" . $port);
break; break;

View file

@ -424,8 +424,6 @@ class fuckhtml{
$escape = false; $escape = false;
} }
$lastchar = $json[$i];
if( if(
$bracket === false && $bracket === false &&
$is_close_bracket === false $is_close_bracket === false
@ -436,7 +434,6 @@ class fuckhtml{
case "[": case "[":
case "{": case "{":
// dont execute whats in "default"
$json_out .= $json[$i]; $json_out .= $json[$i];
break; break;

View file

@ -295,6 +295,7 @@ class brave{
$html = fread($handle, filesize("scraper/brave.html")); $html = fread($handle, filesize("scraper/brave.html"));
fclose($handle); fclose($handle);
*/ */
try{ try{
$html = $html =
$this->get( $this->get(
@ -406,7 +407,7 @@ class brave{
if(!isset($grep[1])){ if(!isset($grep[1])){
throw new Exception("Could not get data JS"); throw new Exception("Could not grep JavaScript object");
} }
$data = $data =
@ -416,6 +417,24 @@ class brave{
); );
unset($grep); unset($grep);
if($data === null){
throw new Exception("Failed to decode JavaScript object");
}
if(
isset($data[2]["data"]["title"]) &&
stripos($data[2]["data"]["title"], "PoW Captcha") !== false
){
throw new Exception("Brave returned a PoW captcha");
}
if(!isset($data[1]["data"]["body"]["response"])){
throw new Exception("Brave did not return a result object");
}
$data = $data[1]["data"]["body"]["response"]; $data = $data[1]["data"]["body"]["response"];
/* /*

View file

@ -2591,6 +2591,12 @@ class google{
foreach($relateds as $related){ foreach($relateds as $related){
if(!isset($related["innerHTML"])){
// found an image
continue;
}
$text = $text =
$this->fuckhtml $this->fuckhtml
->getTextContent( ->getTextContent(
@ -3192,41 +3198,52 @@ class google{
$this->fuckhtml->load($header[0]); $this->fuckhtml->load($header[0]);
$title_tag = // g-snackbar-action present: we found a button instead
$this->fuckhtml if(
->getElementsByAttributeValue( count(
"data-attrid",
"title",
"div"
);
if(count($title_tag) !== 0){
$title =
$this->fuckhtml $this->fuckhtml
->getTextContent( ->getElementsByTagName(
$title_tag[0] "g-snackbar-action"
); )
) !== 0
){
$header[0]["innerHTML"] = $title_tag =
str_replace(
$title_tag[0]["outerHTML"],
"",
$header[0]["innerHTML"]
);
// if header still contains text, add it as a subtitle in description
$subtitle =
$this->fuckhtml $this->fuckhtml
->getTextContent( ->getElementsByAttributeValue(
$header[0] "data-attrid",
"title",
"div"
); );
if(strlen($subtitle) !== 0){ if(count($title_tag) !== 0){
$title =
$this->fuckhtml
->getTextContent(
$title_tag[0]
);
$description[] = [ $header[0]["innerHTML"] =
"type" => "quote", str_replace(
"value" => $subtitle $title_tag[0]["outerHTML"],
]; "",
$header[0]["innerHTML"]
);
// if header still contains text, add it as a subtitle in description
$subtitle =
$this->fuckhtml
->getTextContent(
$header[0]
);
if(strlen($subtitle) !== 0){
$description[] = [
"type" => "quote",
"value" => $subtitle
];
}
} }
} }
@ -3386,9 +3403,117 @@ class google{
$this->fuckhtml->load($rhs); $this->fuckhtml->load($rhs);
} }
// abort if we didnt find any description // initialize sublinks
$sublinks = [];
// get description from business
if(count($description) === 0){ if(count($description) === 0){
$data_attrid =
$this->fuckhtml
->getElementsByAttributeName(
"data-attrid"
);
$summary =
$this->fuckhtml
->getElementsByAttributeValue(
"data-attrid",
"kc:/local:one line summary",
$data_attrid
);
if(count($summary) !== 0){
$description[] = [
"type" => "quote",
"value" =>
$this->fuckhtml
->getTextContent(
$summary[0]
)
];
// remove summary so it doesnt get parsed as a table
$rhs["innerHTML"] =
str_replace(
$summary[0]["outerHTML"],
"",
$rhs["innerHTML"]
);
$this->fuckhtml->load($rhs);
}
$address =
$this->fuckhtml
->getElementsByAttributeValue(
"data-attrid",
"kc:/location/location:address",
$data_attrid
);
if(count($address) !== 0){
$description[] = [
"type" => "text",
"value" =>
$this->fuckhtml
->getTextContent(
$address[0]
)
];
}
// get title
$title_div =
$this->fuckhtml
->getElementsByAttributeValue(
"data-attrid",
"title",
$data_attrid
);
if(count($title_div) !== 0){
$title =
$this->fuckhtml
->getTextContent(
$title_div[0]
);
}
// get phone number
$phone =
$this->fuckhtml
->getElementsByAttributeValue(
"data-attrid",
"kc:/local:alt phone",
$data_attrid
);
if(count($phone) !== 0){
$this->fuckhtml->load($phone[0]);
$sublinks["Call"] =
"tel:" .
$this->fuckhtml
->getTextContent(
$this->fuckhtml
->getElementsByAttributeName(
"aria-label",
"span"
)[0]
);
$this->fuckhtml->load($rhs);
}
}
if(count($description) === 0){
// still no description? abort
return $out; return $out;
} }
@ -3437,7 +3562,55 @@ class google{
": " ": "
); );
if($key == ""){ if(
$key == "" ||
$key == "Phone"
){
continue;
}
if($key == "Hours"){
$hours = [];
$this->fuckhtml->load($elem);
$trs =
$this->fuckhtml
->getElementsByTagName(
"tr"
);
foreach($trs as $tr){
$this->fuckhtml->load($tr);
$tds =
$this->fuckhtml
->getElementsByTagName(
"td"
);
if(count($tds) === 2){
$hours[] =
$this->fuckhtml
->getTextContent(
$tds[0]
) . ": " .
$this->fuckhtml
->getTextContent(
$tds[1]
);
}
}
if(count($hours) !== 0){
$hours = implode("\n", $hours);
$table["Hours"] = $hours;
}
continue; continue;
} }
@ -3451,14 +3624,10 @@ class google{
$elem $elem
) )
); );
// reset
$this->fuckhtml->load($rhs);
} }
// reset
// get sublink elements $this->fuckhtml->load($rhs);
$sublinks = [];
// get the website div // get the website div
$as = $as =
@ -3482,6 +3651,28 @@ class google{
["href"] ["href"]
) )
); );
}else{
// get website through button
$button =
$this->fuckhtml
->getElementsByClassName(
"ab_button",
"a"
);
if(count($button) !== 0){
$sublinks["Website"] =
$this->unshiturl(
$this->fuckhtml
->getTextContent(
$button[0]
["attributes"]
["href"]
)
);
}
} }
// get social media links // get social media links

View file

@ -794,6 +794,10 @@ table tr a:last-child{
background:var(--1d2021); background:var(--1d2021);
} }
.web .wiki-head table td{
white-space:pre-line;
}
.web .wiki-head td, .about table td{ .web .wiki-head td, .about table td{
padding:4px 7px; padding:4px 7px;
vertical-align:middle; vertical-align:middle;