fixed mwmbl, results are slightly better but wtf did they do to the sublinks my gawd

2025-01-22 00:12:32 -05:00 · 2024-08-08 03:29:29 -04:00 · 2024-08-08 03:29:29 -04:00 · fbac3eeb8d
commit fbac3eeb8d
parent 36993013e5
2 changed files with 75 additions and 7 deletions
--- a/docs/configure.md
+++ b/docs/configure.md
@ -8,10 +8,10 @@ Welcome! This guide assumes that you have a working 4get instance. This will hel
 3. The captcha imagesets are located in `data/captcha/your_image_set/*.png`
 4. The captcha font is located in `data/fonts/captcha.ttf`

-# Cloudflare bypass
+# Cloudflare bypass (TLS check)
 **Note: this only allows you to bypass the browser integrity checks. Captchas & javascript challenges will not be bypassed.**

-Configuring this lets you fetch images sitting behind Cloudflare and allows you to scrape the **Yep** search engine. Following these instructions might make your package manager unhappy.
+Configuring this lets you fetch images sitting behind Cloudflare and allows you to scrape the **Yep** and **Mwmbl** search engines. Please be aware that APT will fight you on this: it will reinstall the OpenSSL version of curl every time you update your packages.

 First, follow these instructions. Only install the Firefox modules:

--- a/scraper/mwmbl.php
+++ b/scraper/mwmbl.php
@ -27,18 +27,24 @@ class mwmbl{
 		
 		curl_setopt($curlproc, CURLOPT_URL, $url);
 		
+		// use http2
+		curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
+		
 		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
 		curl_setopt($curlproc, CURLOPT_HTTPHEADER,
 			["User-Agent: " . config::USER_AGENT,
 			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
 			"Accept-Language: en-US,en;q=0.5",
 			"Accept-Encoding: gzip",
+			"Referer: https://beta.mwmbl.org/",
 			"DNT: 1",
+			"Sec-GPC: 1",
 			"Connection: keep-alive",
 			"Upgrade-Insecure-Requests: 1",
 			"Sec-Fetch-Dest: document",
 			"Sec-Fetch-Mode: navigate",
-			"Sec-Fetch-Site: none",
+			"Sec-Fetch-Site: same-origin",
+			"Priority: u=0, i",
 			"Sec-Fetch-User: ?1"]
 		);
 		
@ -46,7 +52,7 @@ class mwmbl{
 		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
 		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
 		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
-		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
+		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); // @todo reset
 		
 		$this->backend->assign_proxy($curlproc, $proxy);
 		
@ -72,14 +78,14 @@ class mwmbl{
 		try{
 			$html = $this->get(
 				$this->backend->get_ip(), // no next page!
-				"https://mwmbl.org/app/home/",
+				"https://beta.mwmbl.org/",
 				[
 					"q" => $search
 				]
 			);
 		}catch(Exception $error){
 			
-			throw new Exception("Failed to fetch HTML");
+			throw new Exception("Failed to fetch HTML. If you're getting a timeout, make sure you have curl-impersonate setup.");
 		}
 		
 		$out = [
@ -115,6 +121,68 @@ class mwmbl{
 				$this->fuckhtml
 				->getElementsByTagName("p");
 			
+			$sublinks = [];
+			
+			$mores =
+				$this->fuckhtml
+				->getElementsByClassName(
+					"result-link-more",
+					"div"
+				);
+			
+			foreach($mores as $more){
+				
+				$this->fuckhtml->load($more);
+				
+				$as =
+					$this->fuckhtml
+					->getElementsByClassName(
+						"more",
+						"a"
+					);
+				
+				if(count($as) === 0){
+					
+					// ?? invalid
+					continue;
+				}
+				
+				$sublinks[] = [
+					"title" =>
+						$this->titledots(
+							$this->fuckhtml
+							->getTextContent(
+								$this->fuckhtml
+								->getElementsByClassName(
+									"more-title",
+									"span"
+								)[0]
+							)
+						),
+					"description" =>
+						$this->titledots(
+							$this->fuckhtml
+							->getTextContent(
+								$this->fuckhtml
+								->getElementsByClassName(
+									"more-extract",
+									"span"
+								)[0]
+							)
+						),
+					"url" =>
+						$this->fuckhtml
+						->getTextContent(
+							$as[0]
+							["attributes"]
+							["href"]
+						)
+				];
+			}
+			
+			// reset
+			$this->fuckhtml->load($result);
+			
 			$out["web"][] = [
 				"title" =>
 					$this->titledots(
@ -153,7 +221,7 @@ class mwmbl{
 					"url" => null,
 					"ratio" => null
 				],
-				"sublink" => [],
+				"sublink" => $sublinks,
 				"table" => []
 			];
 		}