mirror of
https://codeberg.org/SafeTwitch/safetwitch.git
synced 2025-01-21 20:02:30 -05:00
Switch to a much more lightweight and faster scraping method
This commit is contained in:
parent
3ecc4a05da
commit
7d680c47cd
5 changed files with 206 additions and 220 deletions
|
@ -1,12 +1,8 @@
|
|||
{
|
||||
"dependencies": {
|
||||
"@dragongoose/streamlink": "^1.1.1",
|
||||
"connect-history-api-fallback": "^2.0.0",
|
||||
"dotenv": "^16.0.3",
|
||||
"express": "^4.18.2",
|
||||
"puppeteer": "^19.7.2",
|
||||
"puppeteer-extra": "^3.3.6",
|
||||
"puppeteer-extra-plugin-adblocker": "^2.13.6",
|
||||
"winston": "^3.8.2",
|
||||
"ws": "^8.13.0"
|
||||
},
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
import { Router } from 'express'
|
||||
import { TwitchScraper } from '../util/scraping/extractors'
|
||||
import { TwitchAPI } from '../util/scraping/extractor/index'
|
||||
|
||||
const profileRouter = Router()
|
||||
const scraper = new TwitchScraper()
|
||||
const twitch = new TwitchAPI()
|
||||
|
||||
profileRouter.get('/users/:username', async (req, res, next) => {
|
||||
const username = req.params.username
|
||||
|
||||
let streamerData = await scraper.getStreamerData(username)
|
||||
let streamerData = await twitch.getStreamerInfo(username)
|
||||
.catch(next)
|
||||
|
||||
if (streamerData)
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
export interface Socials {
|
||||
export interface Social {
|
||||
type: string | null
|
||||
text: string,
|
||||
link: string
|
||||
|
@ -9,16 +9,20 @@ export interface StreamData {
|
|||
title: string
|
||||
topic: string
|
||||
startedAt: number
|
||||
qualities: string[]
|
||||
viewers: number
|
||||
preview: string
|
||||
}
|
||||
|
||||
/**
 * Public description of a streamer's channel as returned by the profile API.
 *
 * Each member is declared exactly once; the previous comma-terminated
 * declarations (including `socials?: string[]` and `stream?: StreamData`)
 * conflicted with the ones below and have been dropped.
 */
export interface StreamerData {
  /** Name of the streamer as shown on their channel */
  username: string
  /** Exact follower count */
  followers: number
  /** Human-readable, abbreviated form of `followers` */
  followersAbbv: string
  /** Whether the streamer is currently broadcasting */
  isLive: boolean
  /** The channel's "about" / description text */
  about: string
  /** Social links listed on the channel, when any are available */
  socials?: Social[]
  /** URL of the profile picture */
  pfp: string
  /** Details of the running broadcast, or `null` when the streamer is offline */
  stream: StreamData | null
  /** Whether the channel is a Twitch partner */
  isPartner: boolean
  /** Channel accent color as a hex string */
  colorHex: string
  /** Numeric Twitch user id */
  id: number
}
|
189
server/util/scraping/extractor/index.ts
Normal file
189
server/util/scraping/extractor/index.ts
Normal file
|
@ -0,0 +1,189 @@
|
|||
import { LooseObject } from "../../../types/looseTypes"
|
||||
import { StreamerData, StreamData, Social } from "../../../types/scraping/Streamer"
|
||||
|
||||
/**
 * Class that interacts with the Twitch GraphQL API.
 *
 * Requests are sent as a batch (JSON array) of persisted-query operations;
 * the response is an array with one entry per operation, in the same order.
 */
export class TwitchAPI {
  public readonly twitchUrl = 'https://gql.twitch.tv/gql'
  public headers = {
    "Client-Id": "kimne78kx3ncx6brgo4mv6wki5h1ko"
  }

  constructor() {}

  /**
   * Builds one persisted-query operation for a batch request.
   * @param operationName Name of the GraphQL operation
   * @param variables Variables passed to the operation
   * @param sha256Hash Hash identifying the persisted query
   */
  private persistedQuery(operationName: string, variables: Record<string, unknown>, sha256Hash: string) {
    return {
      operationName,
      variables,
      extensions: {
        persistedQuery: {
          version: 1,
          sha256Hash
        }
      }
    }
  }

  /**
   * Posts a batch of operations and returns the parsed response array.
   * @param payload The operations to send, in order
   * @throws Error when the HTTP status is not OK or the body is not an array
   */
  private async postOperations(payload: unknown[]): Promise<LooseObject[]> {
    const res = await fetch(this.twitchUrl, {
      method: 'POST',
      body: JSON.stringify(payload),
      headers: this.headers
    })

    if (!res.ok)
      throw new Error(`Twitch API request failed with status ${res.status}`)

    const data = await res.json()
    if (!Array.isArray(data))
      throw new Error('Unexpected response from the Twitch API')

    return data
  }

  /**
   * Gets information about a streamer, like socials, about, and more.
   * @see StreamerData
   * @param streamerName The username of the streamer
   * @returns Promise<StreamerData>
   * @throws Error when the request fails or no user matches `streamerName`
   */
  public getStreamerInfo = async (streamerName: string): Promise<StreamerData> => {
    // The order of the operations matters: the response is read by index below.
    const payload = [
      this.persistedQuery("ChannelRoot_AboutPanel", { channelLogin: streamerName, skipSchedule: false }, "6089531acef6c09ece01b440c41978f4c8dc60cb4fa0124c9a9d3f896709b6c6"),
      this.persistedQuery("StreamMetadata", { channelLogin: streamerName }, "a647c2a13599e5991e175155f798ca7f1ecddde73f7f341f39009c14dbf59962"),
      this.persistedQuery("StreamTagsTrackingChannel", { channel: streamerName }, "6aa3851aaaf88c320d514eb173563d430b28ed70fdaaf7eeef6ed4b812f48608"),
      this.persistedQuery("VideoPreviewOverlay", { login: streamerName }, "9515480dee68a77e667cb19de634739d33f243572b007e98e67184b1a5d8369f"),
      this.persistedQuery("UseViewCount", { channelLogin: streamerName }, "00b11c9c428f79ae228f30080a06ffd8226a1f068d6f52fbc057cbde66e994c2"),
    ]

    const data = await this.postOperations(payload)

    const user = data[0]?.data?.user
    if (!user)
      throw new Error(`Streamer ${streamerName} not found`)

    // get socials, using the field names declared by the Social interface
    const rawSocials: LooseObject[] = user.channel?.socialMedias ?? []
    const socials: Social[] = rawSocials.map((social): Social => ({
      type: social.name,
      text: social.title,
      link: social.url
    }))

    // check if the streamer is live; `stream` is empty while offline
    const metadataUser = data[1]?.data?.user
    const rawStreamData = metadataUser?.stream
    let parsedStream: StreamData | null = null
    if (rawStreamData) {
      const rawTags: LooseObject[] = data[2]?.data?.user?.stream?.freeformTags ?? []
      const tags: string[] = rawTags.map(tagData => tagData.name)

      parsedStream = {
        title: metadataUser.lastBroadcast.title,
        topic: rawStreamData.game.name,
        startedAt: new Date(rawStreamData.createdAt).valueOf(),
        tags,
        viewers: Number(data[4].data.user.stream.viewersCount),
        preview: data[3].data.user.stream.previewImageURL
      }
    }

    const abbreviatedFollowers = Intl.NumberFormat('en-US', {
      notation: "compact",
      maximumFractionDigits: 1
    }).format(user.followers.totalCount)

    const streamerData: StreamerData = {
      username: user.displayName,
      about: user.description,
      pfp: user.profileImageURL,
      followers: user.followers.totalCount,
      socials,
      isLive: (!!parsedStream),
      isPartner: user.isPartner,
      followersAbbv: abbreviatedFollowers,
      // NOTE(review): presumably primaryColorHex can be missing for some users,
      // which would yield '#null' here — confirm and pick a fallback color.
      colorHex: '#' + user.primaryColorHex,
      id: Number(user.id),
      stream: parsedStream
    }

    return streamerData
  }

  /**
   * Gets the current viewers of a stream
   * @param streamerName The username of the streamer
   * @returns Promise<number>
   * @throws Error when the request fails, the user is unknown, or the streamer is not live
   */
  public getViewers = async (streamerName: string): Promise<number> => {
    const payload = [
      this.persistedQuery("UseViewCount", { channelLogin: streamerName }, "00b11c9c428f79ae228f30080a06ffd8226a1f068d6f52fbc057cbde66e994c2"),
    ]

    const rawData = await this.postOperations(payload)

    const user = rawData[0]?.data?.user
    if (!user)
      throw new Error(`Streamer ${streamerName} not found`)

    if (!user.stream)
      throw new Error(`Streamer ${streamerName} is not live`)

    return user.stream.viewersCount
  }
}
|
|
@ -1,203 +0,0 @@
|
|||
import puppeteer from 'puppeteer-extra'
|
||||
import { Browser, Page } from 'puppeteer'
|
||||
import { PuppeteerExtraPluginAdblocker } from 'puppeteer-extra-plugin-adblocker'
|
||||
import { LooseObject } from '../../types/looseTypes'
|
||||
import { StreamData, StreamerData, Socials } from '../../types/scraping/Streamer'
|
||||
import { Streamlink } from '@dragongoose/streamlink'
|
||||
|
||||
|
||||
/**
 * Scrapes streamer information from the rendered twitch.tv channel page with
 * Puppeteer, and uses Streamlink for the live status and available qualities.
 */
export class TwitchScraper {
  public cache: Map<string, StreamerData> = new Map()

  constructor() {
    puppeteer.use(new PuppeteerExtraPluginAdblocker({
      blockTrackersAndAnnoyances: true
    }))
  }

  /**
   * Converts a follower count as displayed on the page (e.g. "1.2K", "3M",
   * "532") into a number.
   * @param num The displayed count
   * @returns The numeric value, or null when the text cannot be parsed
   */
  private abbreviatedNumberToNumber = (num: string) => {
    const multipliers: {[k: string]: number} = {
      'k': 1000,
      'm': 1000000,
      'b': 1000000000
    }

    const text = num.trim()
    if (text.length === 0) return null

    const multiplier = multipliers[text.charAt(text.length - 1).toLowerCase()]
    if (multiplier) {
      return Number(text.slice(0, -1)) * multiplier
    }

    // Counts without a suffix are shown in full, possibly with thousands separators
    const plain = Number(text.replace(/,/g, ''))
    return Number.isNaN(plain) ? null : plain
  }

  // https://advancedweb.hu/how-to-speed-up-puppeteer-scraping-with-parallelization/
  /**
   * Launches a headless browser, runs `fn` with it and always closes the
   * browser afterwards, even when `fn` throws.
   */
  private withBrowser = async <T>(fn: (browser: Browser) => Promise<T>): Promise<T> => {
    const browser = await puppeteer.launch({
      headless: true,
      args: ['--no-sandbox']
    })
    try {
      return await fn(browser)
    } finally {
      await browser.close()
    }
  }

  /**
   * Opens a new page in `browser`, runs `fn` with it and always closes the
   * page afterwards, even when `fn` throws.
   */
  private withPage = (browser: Browser) => async <T>(fn: (page: Page) => Promise<T>): Promise<T> => {
    const page = await browser.newPage()
    //await page.tracing.start({ path: '../profile.json', screenshots: true });
    try {
      return await fn(page)
    } finally {
      //await page.tracing.stop();
      await page.close()
    }
  }

  /**
   * Reads tags, title, topic and start time of the running stream from the page.
   * @param page A page that has already navigated to the channel
   * @param isLive Whether the streamer is currently live
   * @returns The stream data, or null when offline or when the uptime is missing
   */
  private getStreamData = async (page: Page, isLive: boolean) => {
    if(!isLive) return null

    const streamData: LooseObject = {}

    // Get stream tags
    const tagsSelector = '.eUxEWt * span'
    const tags: string[] = await page.$$eval(tagsSelector, elements => elements.map(el => el.innerHTML))
    streamData.tags = tags

    // Get stream title
    const titleSelector = 'h2.CoreText-sc-1txzju1-0'
    const title: string = await page.$eval(titleSelector, element => element.innerText)
    streamData.title = title

    // Get topic
    const topicSelector = '.hfMGmo'
    const topic = await page.$eval(topicSelector, element => element.textContent)
    streamData.topic = topic

    // Get Start time
    const liveTimeSelector = '.live-time'

    // formatted as HH:MM:SS
    const liveTime = await page.$eval(liveTimeSelector, element => element.textContent)
    if(!liveTime) return null
    const [hours, minutes, seconds]: number[] = liveTime.split(':').map(Number)

    // Subtract the elapsed live time from the current time to get the moment
    // the stream started. Working in milliseconds avoids adjusting the
    // hour/minute/second fields of a local date one by one.
    const elapsedMs = ((hours * 60 + minutes) * 60 + seconds) * 1000
    streamData.startedAt = Date.now() - elapsedMs

    return streamData as StreamData
  }

  /**
   * Reads followers, about text, socials and profile picture from the page.
   * When the streamer is offline the "about" tab is opened first.
   * @param page A page that has already navigated to the channel
   * @param isLive Whether the streamer is currently live
   */
  private getAboutData = async (page: Page, isLive: boolean) => {
    const aboutData: LooseObject = {}

    if (!isLive) {
      // Get data from about page
      const aboutPageButtonSelector = 'li.InjectLayout-sc-1i43xsx-0:nth-child(2) > a:nth-child(1) > div:nth-child(1) > div:nth-child(1) > p:nth-child(1)'
      await page.click(aboutPageButtonSelector)
    }
    await page.waitForSelector('.kuAEke')

    const followersSelector = '.kuAEke'
    const followers = await page.$eval(followersSelector, element => element.innerHTML)
    aboutData.followersAbbv = followers
    aboutData.followers = this.abbreviatedNumberToNumber(followers)

    const aboutSectionSelector = '.kLFSJC'
    const aboutSection = await page.$eval(aboutSectionSelector, element => element.innerHTML)
    aboutData.about = aboutSection

    const socialSelector = '.ccXeNc * a'
    const socials: Socials[] = await page.$$eval(socialSelector, elements => elements.map((el) => {

      // Declared inside the callback on purpose: the callback is evaluated in
      // the page, so it cannot reference helpers from the surrounding scope.
      const getHostName = (url: string) => {
        const match = url.match(/:\/\/(www[0-9]?\.)?(.[^/:]+)/i);
        if (match != null && match.length > 2 && typeof match[2] === 'string' && match[2].length > 0) {
          const hostname = match[2].split(".");
          return hostname[0];
        }
        else {
          return null;
        }
      }

      const validHosts = ['instagram', 'youtube', 'discord', 'tiktok','twitter']
      const socialHost = getHostName(el.href) || el.href || ''
      let type: string | null = socialHost
      if(!validHosts.includes(socialHost))
        type = null

      return {
        type,
        link: el.href,
        text: el.innerText
      }
    }))
    aboutData.socials = socials

    const profilePictureSelector = 'figure.ScAvatar-sc-144b42z-0:nth-child(2) > img:nth-child(1)'
    const profilePicture = await page.$eval(profilePictureSelector, element => element.getAttribute('src'))
    aboutData.pfp = profilePicture

    return aboutData as StreamerData
  }

  /**
   * Collects everything known about a streamer.
   * @param username The username of the streamer
   * @throws Error when the channel page could not be loaded
   */
  public getStreamerData = async (username: string): Promise<StreamerData> => {
    const isLive = await this.isLive(username)

    // withBrowser/withPage take care of closing the page and the browser.
    const result = await this.withBrowser((browser: Browser) =>
      this.withPage(browser)(async (page: Page) => {
        const res = await page.goto(`https://twitch.tv/${username}`)

        if(!res?.ok()) {
          return null
        } else {
          return Promise.all([this.getStreamData(page, isLive), this.getAboutData(page, isLive)])
        }
      })
    )

    if(!result)
      throw new Error(`Could not load the Twitch page of ${username}`)

    const [stream, about] = result
    const recoveredData: LooseObject = { ...about }
    recoveredData.stream = stream

    // add final information
    if(recoveredData.stream && isLive)
      recoveredData.stream.qualities = await this.getQualities(username)

    recoveredData.isLive = isLive
    recoveredData.username = username
    return recoveredData as StreamerData
  }

  /**
   * Asks Streamlink whether the streamer is currently live.
   * @param username The username of the streamer
   */
  public isLive = async (username: string) => {
    const streamlink = new Streamlink(`https://twitch.tv/${username}`, {})
    return await streamlink.isLive()
  }

  /**
   * Asks Streamlink for the qualities the stream is available in.
   * @param username The username of the streamer
   */
  public getQualities = async (username: string) => {
    const streamlink = new Streamlink(`https://twitch.tv/${username}`, {})
    return await streamlink.getQualities()
  }

}
|
Loading…
Add table
Reference in a new issue