Parse upload date

This commit is contained in:
Sebastiaan de Schaetzen 2025-02-06 07:52:47 +01:00
parent 34023f609b
commit 2c3b06a98b

View File

@ -7,6 +7,7 @@ import (
"github.com/playwright-community/playwright-go"
"log"
"regexp"
"time"
)
const BASE_URL = "https://vivaplus.tv"
@ -86,6 +87,19 @@ func isValidVideoUrl(url string) bool {
return re.MatchString(url)
}
// isBlacklistedUrl reports whether url is exactly the one hard-coded
// checkout path that is excluded. The comparison is an exact,
// case-sensitive string match; no prefix or pattern matching is done.
func isBlacklistedUrl(url string) bool {
	switch url {
	case "/supporters/payments/checkout/posts/63266/available_tiers":
		return true
	default:
		return false
	}
}
// parseDateString parses an absolute date written as abbreviated month name,
// day of month, comma, and four-digit year (for example "Feb 6, 2025").
//
// On success it returns the parsed time; since the layout carries no clock
// or zone information, time.Parse yields midnight UTC for that day. On
// failure it returns the zero time.Time together with the parse error.
func parseDateString(dateStr string) (time.Time, error) {
	const monthDayYear = "Jan 2, 2006"

	parsed, parseErr := time.Parse(monthDayYear, dateStr)
	if parseErr == nil {
		return parsed, nil
	}
	return time.Time{}, parseErr
}
func (w *WebClient) DiscoverAllVideos(db *sql.DB) error {
log.Printf("Loading list of all videos...")
@ -117,20 +131,27 @@ func (w *WebClient) DiscoverAllVideos(db *sql.DB) error {
}
for _, l := range locators {
// Get the URL to the video page
href, err := l.GetAttribute("href")
if err != nil {
return err
}
// Ensure that it's valid and we haven't already scanned it
if _, exists := previousUrls[href]; exists {
continue // The item was already scanned.
}
previousUrls[href] = struct{}{}
if isBlacklistedUrl(href) {
continue // We want to skip this one
}
if !isValidVideoUrl(href) {
return fmt.Errorf("url has bad format: %s", href)
}
previousUrls[href] = struct{}{}
// Insert it into the database
result := tx.QueryRow("select count(1) from videos where url = :url", href)
var count int
err = result.Scan(&count)
@ -148,12 +169,13 @@ func (w *WebClient) DiscoverAllVideos(db *sql.DB) error {
return fmt.Errorf("error inserting into db: %w", err)
}
}
// Scroll to the bottom
log.Printf("Scrolling...")
err = w.page.Keyboard().Press("End")
if err != nil {
return fmt.Errorf("error scrolling to end of page: %w", err)
}
//w.page.tim
}
finish:
err = tx.Commit()
@ -163,6 +185,11 @@ finish:
return nil
}
// relativeTimePattern matches strings made of one or more digits, a single
// unit letter (m, h or s), at least one whitespace character, and the word
// "ago", with nothing before or after. It is compiled once at package
// initialisation instead of on every call.
var relativeTimePattern = regexp.MustCompile(`^\d+[mhs]\s+ago$`)

// isRelativeTimeFormat reports whether input is a relative timestamp such as
// "5m ago", "12h ago" or "30s ago".
//
// The match is anchored at both ends, so leading or trailing whitespace makes
// it fail.
// NOTE(review): units other than m/h/s (e.g. days) are not matched — confirm
// the site never emits them, otherwise such strings fall through to the
// absolute-date parser and produce an error.
func isRelativeTimeFormat(input string) bool {
	return relativeTimePattern.MatchString(input)
}
func (w *WebClient) FetchVideoMetadata(db *sql.DB) error {
log.Printf("Fetching video metadata...")
for {
@ -198,6 +225,19 @@ func (w *WebClient) FetchVideoMetadata(db *sql.DB) error {
return fmt.Errorf("error retrieving description: %w", err)
}
// Get upload date
uploadDateStr, err := w.getInnerText(".video-page__meta")
if err != nil {
return fmt.Errorf("error retrieving upload date: %w", err)
}
uploadDate := time.Now()
if !isRelativeTimeFormat(uploadDateStr) {
uploadDate, err = parseDateString(uploadDateStr)
if err != nil {
return fmt.Errorf("error parsing date string '%s': %w", uploadDateStr, err)
}
}
// Get cast url
videoElement, err := w.page.QuerySelector("video")
if err != nil {
@ -214,7 +254,7 @@ func (w *WebClient) FetchVideoMetadata(db *sql.DB) error {
return fmt.Errorf("error starting transaction: %w", err)
}
defer tx.Rollback()
result, err := tx.Exec("update videos set title = ?, description = ?, cast = ? where id = ?", title, description, castSource, id)
result, err := tx.Exec("update videos set title = ?, description = ?, cast = ?, upload_date = ? where id = ?", title, description, castSource, uploadDate, id)
if err != nil {
return fmt.Errorf("error updating database: %w", err)
}