mirror of
https://git.lolcat.ca/lolcat/4get.git
synced 2024-12-17 23:56:16 -05:00
duckduckgo scraper rewrite
This commit is contained in:
parent
0b3bbe0f15
commit
774f7113df
6 changed files with 1488 additions and 2234 deletions
19
api.txt
19
api.txt
|
@ -1,9 +1,16 @@
|
||||||
__ __ __
|
44
|
||||||
/ // / ____ ____ / /_
|
4444444 44
|
||||||
/ // /_/ __ `/ _ \/ __/
|
44444444 44444 444
|
||||||
/__ __/ /_/ / __/ /_
|
44444444 444444 444444444
|
||||||
/_/ \__, /\___/\__/
|
44444 44444444 444444444
|
||||||
/____/
|
444444444 4444444
|
||||||
|
4444444444 444444
|
||||||
|
4444444444444
|
||||||
|
444444444444444444
|
||||||
|
444444444444444
|
||||||
|
44444444
|
||||||
|
4444
|
||||||
|
44
|
||||||
|
|
||||||
+ Welcome to the 4get API documentation +
|
+ Welcome to the 4get API documentation +
|
||||||
|
|
||||||
|
|
|
@ -119,7 +119,7 @@ class config{
|
||||||
|
|
||||||
// Default user agent to use for scraper requests. Sometimes ignored to get specific webpages
|
// Default user agent to use for scraper requests. Sometimes ignored to get specific webpages
|
||||||
// Changing this might break things.
|
// Changing this might break things.
|
||||||
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0";
|
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0";
|
||||||
|
|
||||||
// Proxy pool assignments for each scraper
|
// Proxy pool assignments for each scraper
|
||||||
// false = Use server's raw IP
|
// false = Use server's raw IP
|
||||||
|
|
|
@ -75,6 +75,7 @@ class backend{
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case "socks5_hostname":
|
case "socks5_hostname":
|
||||||
|
case "socks5h":
|
||||||
case "socks5a":
|
case "socks5a":
|
||||||
curl_setopt($curlproc, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5_HOSTNAME);
|
curl_setopt($curlproc, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5_HOSTNAME);
|
||||||
curl_setopt($curlproc, CURLOPT_PROXY, $address . ":" . $port);
|
curl_setopt($curlproc, CURLOPT_PROXY, $address . ":" . $port);
|
||||||
|
|
|
@ -526,4 +526,85 @@ class fuckhtml{
|
||||||
$string
|
$string
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Extract the first balanced JSON-looking value embedded in a string.
 *
 * Scans $json for the first "[" or "{" found outside of a quoted
 * string, then returns the substring running from that opener up to
 * the character where both the array and object depth counters are
 * back at zero. Brackets inside single- or double-quoted strings are
 * not counted.
 *
 * The result is not validated; callers are expected to decode it
 * themselves.
 *
 * @param string $json Text that contains a JSON array or object.
 * @return string|null The extracted substring, or null when no
 *                     balanced value was found.
 */
public function extract_json($json){
	
	$len = strlen($json);
	$array_level = 0;
	$object_level = 0;
	$in_quote = null;
	$start = null;
	
	for($i=0; $i<$len; $i++){
		
		$char = $json[$i];
		
		switch($char){
			
			case "[":
				if($in_quote === null){
					
					$array_level++;
					if($start === null){
						
						$start = $i;
					}
				}
				break;
			
			case "]":
				// A closer seen before any opener is unrelated noise; counting
				// it would push the depth negative and hide the real value.
				if(
					$in_quote === null &&
					$start !== null
				){
					
					$array_level--;
				}
				break;
			
			case "{":
				if($in_quote === null){
					
					$object_level++;
					if($start === null){
						
						$start = $i;
					}
				}
				break;
			
			case "}":
				// Same reasoning as for "]" above.
				if(
					$in_quote === null &&
					$start !== null
				){
					
					$object_level--;
				}
				break;
			
			case "\"":
			case "'":
				// Count the run of backslashes right before the quote: an even
				// count (including zero) means the backslashes escape each
				// other and the quote itself is real. The loop bound also
				// keeps us from reading $json[-1] for a quote at offset 0.
				$backslashes = 0;
				for($k = $i - 1; $k >= 0 && $json[$k] === "\\"; $k--){
					
					$backslashes++;
				}
				
				if($backslashes % 2 === 0){
					
					// found a non-escaped quote
					if($in_quote === null){
						
						// open quote
						$in_quote = $char;
					}elseif($in_quote === $char){
						
						// close quote
						$in_quote = null;
					}
				}
				break;
		}
		
		if(
			$start !== null &&
			$array_level === 0 &&
			$object_level === 0
		){
			
			// Every bracket opened since $start has been closed.
			return substr($json, $start, $i - $start + 1);
		}
	}
	
	// No opener found, or the value never balanced.
	return null;
}
|
||||||
}
|
}
|
||||||
|
|
3548
scraper/ddg.php
3548
scraper/ddg.php
File diff suppressed because it is too large
Load diff
1
scraper/out.txt
Normal file
1
scraper/out.txt
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in a new issue