0
Fork 0
mirror of https://git.lolcat.ca/lolcat/4get.git synced 2025-01-01 00:03:55 -05:00

fag protection

This commit is contained in:
lolcat 2024-03-24 22:31:19 -04:00
parent 81502d4721
commit 2976c0a6a4
11 changed files with 82 additions and 40 deletions

View file

@ -23,17 +23,13 @@ class config{
// Enable the API?
const API_ENABLED = true;
// Bot protection
// 4get.ca has been hit with 500k bot reqs every single day for months
// you probably want to enable this if your instance is public...
// 0 = disabled
// 1 = ask for image captcha (requires imagemagick v6 or higher)
// @TODO: 2 = invite only (users needs a pass)
const BOT_PROTECTION = 0;
//
// BOT PROTECTION
//
// Maximal number of searches per captcha key/pass issued. Counter gets
// reset on every APCU cache clear (should happen once a day)
const MAX_SEARCHES = 100;
// 0 = disabled, 1 = ask for image captcha, @TODO: 2 = invite only (users need a pass)
// VERY useful against a targeted attack
const BOT_PROTECTION = 0;
// if BOT_PROTECTION is set to 1, specify the available datasets here
// images should be named from 1.png to X.png, and be 100x100 in size
@ -45,6 +41,32 @@ class config{
// ["minecraft", 848]
];
// If this regular expression matches the user agent, the request is blocked
// Not useful at all against a targeted attack
const HEADER_REGEX = '/bot|wget|curl|python-requests|scrapy|go-http-client|ruby|yahoo|spider/i';
// Block clients who present any of the following headers in their request (SPECIFY IN !!lowercase!!)
// Eg: ["x-forwarded-for", "x-via", "forwarded-for", "via"];
// Useful for blocking *some* proxies used for botting
const FILTERED_HEADER_KEYS = [
"x-forwarded-for",
"x-via",
"forwarded-for",
"via"
];
// @TODO: Portscan the user for open proxies before allowing a connection, block user if any are found
// Requires the nmap package
const NMAP_PROXY_CHECK = false;
// @TODO: Make IP blacklist public under /api/v1/blacklist endpoint ?
const PUBLIC_IP_BLACKLIST = true;
// Maximal number of searches per captcha key/pass issued. Counter gets
// reset on every APCU cache clear (should happen once a day).
// Only useful when BOT_PROTECTION is NOT set to 0
const MAX_SEARCHES = 100;
// List of domains that point to your servers. Include your tor/i2p
// addresses here! Must be a valid URL. Won't affect links placed on
// the homepage.

View file

@ -8,6 +8,9 @@ Welcome! This guide assumes that you have a working 4get instance. This will hel
3. The captcha imagesets are located in `data/captcha/your_image_set/*.png`
4. The captcha font is located in `data/fonts/captcha.ttf`
## Robots.txt
Make sure you configure this right to optimize your search engine presence! Head over to `/robots.txt` and change the 4get.ca domain to your own domain.
## Server listing
To be listed on https://4get.ca/instances , you must contact *any* of the people in the server list and ask them to add you to their list of instances in their configuration. The instance list is distributed, and I don't have control over it.
@ -32,4 +35,4 @@ If you see spammy entries in your instances list, simply remove the instance fro
Done! The scraper you chose should now be using the rotating proxies. When asking for the next page of results, it will use the same proxy to avoid detection!
### Important!
If you ever test out a `socks5` proxy locally on your machine and find out it works but doesn't on your server, try supplying the `socks5_hostname` protocol instead.
If you ever test out a `socks5` proxy locally on your machine and find that it works there but not on your server, try supplying the `socks5_hostname` protocol instead. Hopefully this tip can save you 3 hours of your life!

View file

@ -29,7 +29,7 @@ try{
}catch(Exception $error){
$frontend->drawscrapererror($error->getMessage(), $get, "images");
$frontend->drawscrapererror($error->getMessage(), $get, "images", $payload["timetaken"]);
}
if(count($results["image"]) === 0){

View file

@ -32,6 +32,8 @@ class backend{
$proxylist = array_values($proxylist);
echo $proxy_index_raw % count($proxylist);
return $proxylist[$proxy_index_raw % count($proxylist)];
}

View file

@ -44,7 +44,7 @@ class frontend{
$replacements["timetaken"] !== null
){
$replacements["timetaken"] = '<div class="timetaken">Took ' . substr(microtime(true) - $replacements["timetaken"], 0, 4) . 's</div>';
$replacements["timetaken"] = '<div class="timetaken">Took ' . number_format(microtime(true) - $replacements["timetaken"], 2) . 's</div>';
}
$handle = fopen("template/{$template}", "r");
@ -84,29 +84,54 @@ class frontend{
"filters" => $this->generatehtmlfilters($filters, $get)
]);
$headers_raw = getallheaders();
$header_keys = [];
$user_agent = "";
$bad_header = false;
foreach($headers_raw as $headerkey => $headervalue){
$headerkey = strtolower($headerkey);
if($headerkey == "user-agent"){
$user_agent = $headervalue;
continue;
}
// check header key
if(in_array($headerkey, config::FILTERED_HEADER_KEYS)){
$bad_header = true;
break;
}
}
if(
preg_match(
'/bot|wget|curl|python-requests|scrapy|feedfetcher|go-http-client|ruby|universalfeedparser|yahoo\! slurp|spider|rss/i',
$_SERVER["HTTP_USER_AGENT"]
)
config::HEADER_REGEX,
$user_agent
) ||
$bad_header === true
){
// bot detected !!
apcu_inc("captcha_gen");
$null = null;
$this->drawerror(
"Tshh, blocked!",
'You were blocked from viewing this page. If you wish to scrape data from 4get, please consider running <a href="https://git.lolcat.ca/lolcat/4get" rel="noreferrer nofollow">your own 4get instance</a>.',
'Your browser, IP or IP range has been blocked from this 4get instance. If this is an error, please <a href="/about">contact the administrator</a>.',
microtime(true)
);
die();
}
}
public function drawerror($title, $error){
public function drawerror($title, $error, $timetaken){
echo
$this->load("search.html", [
"timetaken" => null,
"timetaken" => $timetaken,
"class" => "",
"right-left" => "",
"right-right" => "",
@ -119,7 +144,7 @@ class frontend{
die();
}
public function drawscrapererror($error, $get, $target){
public function drawscrapererror($error, $get, $target, $timetaken){
$this->drawerror(
"Shit",
@ -131,7 +156,8 @@ class frontend{
'<li>Remove keywords that could cause errors</li>' .
'<li><a href="/instances?target=' . $target . "&" . $this->buildquery($get, false) . '">Try your search on another 4get instance</a></li>' .
'</ul><br>' .
'If the error persists, please <a href="/about">contact the administrator</a>.'
'If the error persists, please <a href="/about">contact the administrator</a>.',
$timetaken
);
}
@ -483,10 +509,6 @@ class frontend{
$archives[] = "warosu.org";
break;
case "cm":
$archives[] = "boards.fireden.net";
break;
case "f":
$archives[] = "archive.4plebs.org";
break;
@ -503,12 +525,10 @@ class frontend{
break;
case "v":
$archives[] = "boards.fireden.net";
$archives[] = "arch.b4k.co";
break;
case "vg":
$archives[] = "boards.fireden.net";
$archives[] = "arch.b4k.co";
break;
@ -579,7 +599,6 @@ class frontend{
break;
case "sci":
$archives[] = "boards.fireden.net";
$archives[] = "warosu.org";
$archives[] = "eientei.xyz";
break;
@ -614,7 +633,6 @@ class frontend{
break;
case "ic":
$archives[] = "boards.fireden.net";
$archives[] = "warosu.org";
break;
@ -741,10 +759,6 @@ class frontend{
$archives[] = "desuarchive.org";
break;
case "y":
$archives[] = "boards.fireden.net";
break;
case "t":
$archives[] = "archiveofsins.com";
break;
@ -802,7 +816,7 @@ class frontend{
$payload .=
'<a href="https://webcache.googleusercontent.com/search?q=cache:' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://google.com" alt="go">Google cache</a>' .
'<a href="https://web.archive.org/web/' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.org" alt="ar">Archive.org</a>' .
'<a href="https://archive.is/newest/' . htmlspecialchars($link) . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.is" alt="ar">Archive.is</a>' .
'<a href="https://archive.ph/newest/' . htmlspecialchars($link) . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.is" alt="ar">Archive.is</a>' .
'<a href="https://ghostarchive.org/search?term=' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://ghostarchive.org" alt="gh">Ghostarchive</a>' .
'<a href="https://www.bing.com/search?q=url%3A' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://bing.com" alt="bi">Bing cache</a>' .
'<a href="https://megalodon.jp/?url=' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://megalodon.jp" alt="me">Megalodon</a>' .

View file

@ -73,7 +73,7 @@ class fuckhtml{
$attributes = [];
preg_match_all(
'/([^\/\s\\=]+)(?:\s*=\s*("[^"]*"|\'[^\']*\'|[^\s]*))?/',
'/([^\/\s\\=]+)(?:\s*=\s*("[^"]*"|\'[^\']*\'|[^\s]*))?/i',
$starting_tags[2][$i][0],
$regex_attributes
);
@ -88,7 +88,7 @@ class fuckhtml{
continue;
}
$attributes[$regex_attributes[1][$k]] =
$attributes[strtolower($regex_attributes[1][$k])] =
trim($regex_attributes[2][$k], "'\" \n\r\t\v\x00");
}

View file

@ -31,7 +31,7 @@ try{
}catch(Exception $error){
$frontend->drawscrapererror($error->getMessage(), $get, "music");
$frontend->drawscrapererror($error->getMessage(), $get, "music", $payload["timetaken"]);
}
$categories = [

View file

@ -31,7 +31,7 @@ try{
}catch(Exception $error){
$frontend->drawscrapererror($error->getMessage(), $get, "news");
$frontend->drawscrapererror($error->getMessage(), $get, "news", $payload["timetaken"]);
}
/*

View file

@ -654,6 +654,7 @@ class google{
throw new Exception("Failed to get HTML");
}
//$html = file_get_contents("scraper/google.html");
}

View file

@ -31,7 +31,7 @@ try{
}catch(Exception $error){
$frontend->drawscrapererror($error->getMessage(), $get, "videos");
$frontend->drawscrapererror($error->getMessage(), $get, "videos", $payload["timetaken"]);
}
$categories = [

View file

@ -31,7 +31,7 @@ try{
}catch(Exception $error){
$frontend->drawscrapererror($error->getMessage(), $get, "web");
$frontend->drawscrapererror($error->getMessage(), $get, "web", $payload["timetaken"]);
}
/*