Working on scraping videos

This commit is contained in:
Sebastiaan de Schaetzen 2025-02-01 19:39:55 +01:00
parent 1d0fd243fe
commit 27c58d06e0
4 changed files with 95 additions and 96 deletions

6
go.mod
View File

@ -7,5 +7,11 @@ require github.com/mattn/go-sqlite3 v1.14.24
require (
github.com/PuerkitoBio/goquery v1.10.1 // indirect
github.com/andybalholm/cascadia v1.3.3 // indirect
github.com/deckarep/golang-set/v2 v2.7.0 // indirect
github.com/go-jose/go-jose/v3 v3.0.3 // indirect
github.com/go-stack/stack v1.8.1 // indirect
github.com/headzoo/surf v1.0.1 // indirect
github.com/playwright-community/playwright-go v0.4902.0 // indirect
golang.org/x/net v0.33.0 // indirect
gopkg.in/headzoo/surf.v1 v1.0.1 // indirect
)

19
go.sum
View File

@ -2,9 +2,24 @@ github.com/PuerkitoBio/goquery v1.10.1 h1:Y8JGYUkXWTGRB6Ars3+j3kN0xg1YqqlwvdTV8W
github.com/PuerkitoBio/goquery v1.10.1/go.mod h1:IYiHrOMps66ag56LEH7QYDDupKXyo5A8qrjIx3ZtujY=
github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/deckarep/golang-set/v2 v2.7.0 h1:gIloKvD7yH2oip4VLhsv3JyLLFnC0Y2mlusgcvJYW5k=
github.com/deckarep/golang-set/v2 v2.7.0/go.mod h1:VAky9rY/yGXJOLEDv3OMci+7wtDpOF4IN+y82NBOac4=
github.com/go-jose/go-jose/v3 v3.0.3 h1:fFKWeig/irsp7XD2zBxvnmA/XaRWp5V3CBsZXJF7G7k=
github.com/go-jose/go-jose/v3 v3.0.3/go.mod h1:5b+7YgP7ZICgJDBdfjZaIt+H/9L9T/YQrVfLAMboGkQ=
github.com/go-stack/stack v1.8.1 h1:ntEHSVwIt7PNXNpgPmVfMrNhLtgjlmnZha2kOpuRiDw=
github.com/go-stack/stack v1.8.1/go.mod h1:dcoOX6HbPZSZptuspn9bctJ+N/CnF5gGygcUP3XYfe4=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/headzoo/surf v1.0.1 h1:wk3+LT8gjnCxEwfBJl6MhaNg154En5KjgmgzAG9uMS0=
github.com/headzoo/surf v1.0.1/go.mod h1:/bct0m/iMNEqpn520y01yoaWxsAEigGFPnvyR1ewR5M=
github.com/mattn/go-sqlite3 v1.14.24 h1:tpSp2G2KyMnnQu99ngJ47EIkWVmliIizyZBfPrBWDRM=
github.com/mattn/go-sqlite3 v1.14.24/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
github.com/playwright-community/playwright-go v0.4902.0 h1:SslPUKmc35YgTBZKTLhokxrqTsVk3/mirj+TkqR6dC0=
github.com/playwright-community/playwright-go v0.4902.0/go.mod h1:kBNWs/w2aJ2ZUp1wEOOFLXgOqvppFngM5OS+qyhl+ZM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
@ -70,3 +85,7 @@ golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58=
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/headzoo/surf.v1 v1.0.1 h1:oDBy9b5NlTb2Hvl3hF8NN+Qy7ypC9/g5YDP85pPh13k=
gopkg.in/headzoo/surf.v1 v1.0.1/go.mod h1:T0BH8276y+OPL0E4tisxCFjBVIAKGbwdYU7AS7/EpQQ=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

37
main.go
View File

@ -1,38 +1,35 @@
package main
import (
"github.com/playwright-community/playwright-go"
"log"
"os"
)
// main installs the Playwright browser bundle, opens the database, builds a
// WebClient and asks it to list all videos.
//
// NOTE(review): this span is a diff hunk (@ -1,38 +1,35 @) rendered without
// +/- markers, so statements from the previous and the new revision sit side
// by side (two `w :=`, two `err :=`). Presumably only one of each survives in
// the committed file — confirm against the repository before editing.
func main() {
options := &playwright.RunOptions{
// NOTE(review): only "chromium" is requested for installation, while
// NewWebClient launches pw.Firefox — confirm the browser list matches.
Browsers: []string{"chromium"},
}
err := playwright.Install(options)
if err != nil {
// NOTE(review): %w is only interpreted by fmt.Errorf; the log functions
// format like fmt.Printf, so %v is probably intended here.
log.Panicf("error installing playwright: %w", err)
}
db := openDatabase()
defer db.Close()
username := os.Getenv("VIVAPLUS_USER")
password := os.Getenv("VIVAPLUS_PASS")
w := NewWebClient(options)
//username := os.Getenv("VIVAPLUS_USER")
////password := os.Getenv("VIVAPLUS_PASS")
//password, err := base64.StdEncoding.DecodeString(os.Getenv("VIVAPLUS_PASS"))
//if err != nil {
// log.Fatalf("error decoding password: %v", err)
//}
w := NewWebClient()
err := w.VivaLogin(username, string(password))
if err != nil {
log.Fatalf("error login in: %v", err)
}
//form := url.Values{}
//form.Set("email", username)
//form.Set("password", string(password))
//
//// First fetch csrf token by doing a get. It is found in a meta tag with name="csrf-token"
//
//resp, err := http.Post("https://vivaplus.tv/supporters/sign_in", "application/x-www-form-urlencoded;charset=UTF-8", strings.NewReader(form.Encode()))
//err = w.VivaLogin(username, string(password))
//if err != nil {
// log.Fatalf("error logging in: %v", err)
// log.Fatalf("error login in: %v", err)
//}
//
//log.Printf("Status code: %d", resp.StatusCode)
////println(resp)
// NOTE(review): VivaFindAllVideos returns an error that is discarded here.
w.VivaFindAllVideos()
}

View File

@ -1,105 +1,82 @@
package main
import (
"errors"
"fmt"
"github.com/PuerkitoBio/goquery"
"github.com/playwright-community/playwright-go"
"log"
"net/http"
"net/http/cookiejar"
"net/url"
"strings"
)
// Endpoints on vivaplus.tv used by the web client.
const (
	// BASE_URL is the site root; it ends with a slash so paths can be appended directly.
	BASE_URL = "https://vivaplus.tv/"

	// SIGN_IN_URL is the supporter sign-in page, built from BASE_URL.
	SIGN_IN_URL = BASE_URL + "supporters/sign_in"
)
// WebClient holds the state used to talk to vivaplus.tv.
//
// NOTE(review): rendered from a diff without +/- markers; the
// csrfToken/cookies fields and the pw/browser fields presumably belong to
// different revisions of the struct — confirm against the repository.
type WebClient struct {
// csrfToken is set by FetchCsrfToken from the content attribute of the
// sign-in page's csrf-token meta tag.
csrfToken string
// cookies is the jar created in NewWebClient; RegisterCookies stores
// response cookies in it and UseCookies copies them onto requests.
cookies *cookiejar.Jar
// pw is the value returned by playwright.Run in NewWebClient.
pw *playwright.Playwright
// browser is the browser launched in NewWebClient; the Viva* methods create
// their pages from it.
browser playwright.Browser
//browser *browser.Browser
//csrfToken string
//cookies *cookiejar.Jar
}
// NewWebClient constructs a WebClient and terminates the process via
// log.Fatalf when a setup step fails.
//
// NOTE(review): two revisions are interleaved here because the diff was
// rendered without +/- markers. One takes no arguments and builds a cookie jar
// with cookiejar.New; the other takes Playwright run options, starts
// Playwright and launches Firefox. Confirm which signature is current.
func NewWebClient() *WebClient {
jar, err := cookiejar.New(nil)
func NewWebClient(options *playwright.RunOptions) *WebClient {
pw, err := playwright.Run(options)
if err != nil {
// NOTE(review): %w is only interpreted by fmt.Errorf; %v is probably intended
// in these log.Fatalf calls.
log.Fatalf("error creating cookiejar: %w", err)
// NOTE(review): pw is passed where err looks intended, so the actual failure
// reason is not what gets logged.
log.Fatalf("error running playwright: %w", pw)
}
browser, err := pw.Firefox.Launch()
if err != nil {
// NOTE(review): same as above — the launch error err is not the value logged.
log.Fatalf("error running firefox: %w", pw)
}
return &WebClient{
cookies: jar,
pw: pw,
browser: browser,
}
}
// RegisterCookies stores the cookies carried by resp in the client's cookie
// jar under the URL rawUrl.
//
// The process is terminated via log.Fatalf when rawUrl cannot be parsed.
func (w *WebClient) RegisterCookies(rawUrl string, resp *http.Response) {
	target, parseErr := url.Parse(rawUrl)
	if parseErr != nil {
		log.Fatalf("error parsing url: %v", parseErr)
	}

	received := resp.Cookies()
	w.cookies.SetCookies(target, received)
}
// UseCookies attaches every cookie the jar holds for rawUrl to req.
//
// The process is terminated via log.Fatalf when rawUrl cannot be parsed.
func (w *WebClient) UseCookies(rawUrl string, req *http.Request) {
	target, parseErr := url.Parse(rawUrl)
	if parseErr != nil {
		log.Fatalf("error parsing url: %v", parseErr)
	}

	stored := w.cookies.Cookies(target)
	for i := range stored {
		req.AddCookie(stored[i])
	}
}
// FetchCsrfToken downloads the sign-in page, stores the cookies it sets in the
// client's jar and records the content of the <meta name="csrf-token"> tag in
// w.csrfToken.
//
// It returns an error when the request fails, the server answers with a
// status other than 200, the body cannot be parsed as HTML, or the meta tag
// (or its content attribute) is missing. w.csrfToken is only overwritten on
// success.
func (w *WebClient) FetchCsrfToken() error {
	resp, err := http.Get(SIGN_IN_URL)
	if err != nil {
		return fmt.Errorf("error getting sign in page: %w", err)
	}
	defer resp.Body.Close()

	// An error page would otherwise surface later as a confusing
	// "csrf token element" failure.
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("error getting sign in page: unexpected status %s", resp.Status)
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return fmt.Errorf("error parsing sign in page: %w", err)
	}

	w.RegisterCookies(SIGN_IN_URL, resp)

	// First never returns nil: an unmatched selector yields an empty
	// selection, so the match has to be tested through Length.
	csrfTokenElement := doc.Find("meta[name='csrf-token']").First()
	if csrfTokenElement.Length() == 0 {
		return errors.New("error getting csrf token element")
	}

	token, exists := csrfTokenElement.Attr("content")
	if !exists {
		return errors.New("error content attribute does not exist")
	}

	w.csrfToken = token
	return nil
}
// VivaLogin signs in on the sign-in page with the given username and password.
//
// NOTE(review): two revisions are interleaved here because the diff was
// rendered without +/- markers. One fetches a CSRF token and POSTs a form with
// net/http; the other opens the sign-in page in a browser page, fills the
// inputs by test id and clicks the submit button. Confirm which lines are
// current before relying on this listing.
func (w *WebClient) VivaLogin(username, password string) error {
err := w.FetchCsrfToken()
page, err := w.browser.NewPage()
if err != nil {
return err
// NOTE(review): unreachable as rendered — follows an unconditional return.
return fmt.Errorf("error creating page: %w", err)
}
form := url.Values{}
form.Set("email", username)
form.Set("password", password)
// NOTE(review): the encoded form printed here contains the password in clear text.
println("Encoded form:", form.Encode())
println("CSRF token:", w.csrfToken)
req, err := http.NewRequest("POST", SIGN_IN_URL, strings.NewReader(form.Encode()))
w.UseCookies(SIGN_IN_URL, req)
for _, c := range req.Cookies() {
println("Cookie:", c.Name, "=", c.Value)
}
_, err = page.Goto(SIGN_IN_URL)
if err != nil {
return fmt.Errorf("error creating login request: %w", err)
}
req.Header.Add("x-csrf-token", w.csrfToken)
req.Header.Add("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
// NOTE(review): resp.Body is never closed in the lines visible here.
resp, err := http.DefaultClient.Do(req)
if err != nil {
return fmt.Errorf("error logging in: %w", err)
// NOTE(review): unreachable as rendered — follows an unconditional return.
return fmt.Errorf("error navigating to sign-in page: %w", err)
}
log.Printf("Status code: %d", resp.StatusCode)
err = page.GetByTestId("SupporterLogin.EmailInput").Fill(username)
if err != nil {
return fmt.Errorf("error filling in email: %w", err)
}
err = page.GetByTestId("SupporterLogin.PasswordInput").Fill(password)
if err != nil {
// NOTE(review): this is the password field, but the message says "email".
return fmt.Errorf("error filling in email: %w", err)
}
// NOTE(review): the error from Click is assigned but never checked; the
// function returns nil regardless.
err = page.GetByTestId("SupporterLogin.SubmitButton").Click()
return nil
}
// VivaFindAllVideos opens the "all videos" listing (ascending order) in a new
// browser page and logs how many elements carry the "VideoCatalog.Video" test
// id.
//
// It returns an error when the page cannot be created, the navigation fails
// or the elements cannot be counted.
func (w *WebClient) VivaFindAllVideos() error {
	log.Printf("Loading list of all videos...")

	page, err := w.browser.NewPage()
	if err != nil {
		return fmt.Errorf("error creating page: %w", err)
	}
	// The page is local to this call; close it so repeated calls do not
	// accumulate open pages in the browser.
	defer page.Close()

	if _, err = page.Goto(BASE_URL + "supporters/videos/all?order=asc"); err != nil {
		return fmt.Errorf("error opening page: %w", err)
	}

	count, err := page.GetByTestId("VideoCatalog.Video").Count()
	if err != nil {
		return fmt.Errorf("error counting videos: %w", err)
	}
	log.Printf("Found %d videos", count)

	// Scratch notes kept from the original work in progress:
	//count, err := el.Count()
	//attr, err := el.GetAttribute("busy")
	//println(attr, err)
	//err = el.ScrollIntoViewIfNeeded()
	//println(err)
	//attr, err = el.GetAttribute("busy")
	//println(attr, err)
	return nil
}