chore: 🚧 remove cinemagoer and fiber and add new technique for scraping

This commit is contained in:
2024-11-06 01:59:13 +01:00
parent 6d88e96864
commit cba9dd3ffc
46 changed files with 1179 additions and 1764 deletions
+90
View File
@@ -0,0 +1,90 @@
package handlers
import (
"log/slog"
"net/http"
"strings"
"gopher-toolbox/config"
"github.com/gin-gonic/gin"
"github.com/zepyrshut/rating-orama/internal/repository"
)
// TODO: Extract to toolbox

// Message keys used by the handlers in this package. Every constant is a
// typed string whose value is the snake_case key itself.
const (
	InvalidRequest string = "invalid_request"
	InternalError string = "internal_error"
	// RequestID is used by handleQueryError both as the log attribute name
	// and as the key for looking the request ID up in the request context.
	RequestID string = "request_id"
	NotFound string = "not_found"
	Created string = "created"
	Updated string = "updated"
	Deleted string = "deleted"
	Enabled string = "enabled"
	Disabled string = "disabled"
	Retrieved string = "retrieved"
	ErrorCreating string = "error_creating"
	ErrorUpdating string = "error_updating"
	ErrorEnabling string = "error_enabling"
	ErrorDisabling string = "error_disabling"
	ErrorGetting string = "error_getting"
	ErrorGettingAll string = "error_getting_all"
	InvalidEntityID string = "invalid_entity_id"
	NotImplemented string = "not_implemented"
	// NOTE(review): the *Key constants below look like database constraint
	// names meant to be matched against error text (see handleQueryError's
	// errorMap), each paired with the *AlreadyExists message that follows
	// it — confirm against the callers.
	UserUsernameKey string = "user_username_key"
	UserEmailKey string = "user_email_key"
	// NOTE(review): the identifier is spelled "AlReady"; the string value is
	// spelled correctly. Renaming the identifier would break existing callers.
	UsernameAlReadyExists string = "username_already_exists"
	EmailAlreadyExists string = "email_already_exists"
	IncorrectPassword string = "incorrect_password"
	ErrorGeneratingToken string = "error_generating_token"
	LoggedIn string = "logged_in"
	CategoryNameKey string = "category_name_key"
	CategoryAlreadyExists string = "category_already_exists"
	ItemsNameKey string = "items_name_key"
	NameAlreadyExists string = "name_already_exists"
)
// Handlers is the receiver of the HTTP handler methods in this package and
// carries the dependencies they share.
type Handlers struct {
	// App is the application configuration passed to New.
	App *config.App
	// Queries is the repository passed to New.
	Queries repository.ExtendedQuerier
}
// New allocates a Handlers, stores the repository q and the application
// configuration app in it, and returns a pointer to the new value.
func New(q repository.ExtendedQuerier, app *config.App) *Handlers {
	h := new(Handlers)
	h.App = app
	h.Queries = q
	return h
}
// ToBeImplemented is a placeholder handler for routes that have no
// implementation yet.
//
// It responds with HTTP 501 Not Implemented rather than 200 OK, so that
// clients cannot mistake a missing feature for a successful call. The JSON
// body is {"message": "Not implemented yet"}.
func (hq *Handlers) ToBeImplemented(c *gin.Context) {
	c.JSON(http.StatusNotImplemented, gin.H{
		"message": "Not implemented yet",
	})
}
// Ping answers with HTTP 200 and the JSON body {"message": "pong"}.
func (hq *Handlers) Ping(c *gin.Context) {
	reply := gin.H{"message": "pong"}
	c.JSON(http.StatusOK, reply)
}
// TODO: Extract to toolbox

// handleQueryError turns a query error into an HTTP response and reports
// whether it did so.
//
// A nil err leaves the response untouched and yields false. Otherwise the
// error text is compared against the keys of errorMap: for the first key
// (in map iteration order) contained in the text, the mapped message is
// logged and sent as a 409 Conflict. When no key matches, the raw error text
// is logged and defaultErrorMessage is sent as a 500 Internal Server Error.
// Both error paths return true. Log records carry the request ID read from
// the request context under RequestID.
func handleQueryError(c *gin.Context, err error, errorMap map[string]string, logMessage string, defaultErrorMessage string) bool {
	if err == nil {
		return false
	}

	errText := err.Error()
	requestID := c.Request.Context().Value(RequestID)

	for fragment, publicMessage := range errorMap {
		if !strings.Contains(errText, fragment) {
			continue
		}
		slog.Error(logMessage, "error", publicMessage, RequestID, requestID)
		c.JSON(http.StatusConflict, gin.H{"error": publicMessage})
		return true
	}

	slog.Error(logMessage, "error", errText, RequestID, requestID)
	c.JSON(http.StatusInternalServerError, gin.H{"error": defaultErrorMessage})
	return true
}
+46
View File
@@ -0,0 +1,46 @@
package handlers
//func (hq *Handlers) GetAllChapters(c *fiber.Ctx) error {
// tvShow := models.TvShow{}
// ttShowID := c.Query("id")
// if ttShowID[0:2] == "tt" {
// ttShowID = ttShowID[2:]
// }
// exist := hq.DB.CheckIfTvShowExists(ttShowID)
// if !exist {
// url := fmt.Sprintf(hq.App.Environment.HarvesterApi, ttShowID)
// response, _ := http.Get(url)
// body, _ := io.ReadAll(response.Body)
// err := json.Unmarshal(body, &tvShow)
// if err != nil {
// hq.App.Error(err.Error())
// return c.Status(http.StatusInternalServerError).JSON(err)
// }
// err = hq.DB.InsertTvShow(tvShow)
// if err != nil {
// hq.App.Error(err.Error())
// return c.Status(http.StatusInternalServerError).JSON(err)
// }
// }
// tvShow, err := hq.DB.FetchTvShow(ttShowID)
// if err != nil {
// hq.App.Error(err.Error())
// return c.Status(http.StatusInternalServerError).JSON(err)
// }
// tvShowJSON, err := json.Marshal(tvShow)
// if err != nil {
// hq.App.Error(err.Error())
// return c.Status(http.StatusInternalServerError).JSON(err)
// }
// return c.Render("charts", fiber.Map{
// "TvShow": tvShow,
// "TvShowJSON": string(tvShowJSON),
// })
//}
+68
View File
@@ -0,0 +1,68 @@
package models
// import (
// "strconv"
// "time"
// )
// type Popularity struct {
// ShowID string `json:"show_id"`
// TimesViewed int `json:"times_viewed"`
// }
// type TvShow struct {
// ShowID string `json:"show_id"`
// Title string `json:"title"`
// Runtime int `json:"runtime"`
// Votes int `json:"votes"`
// AvgRating float64 `json:"avg_rating"`
// MedianRating float64 `json:"median_rating"`
// Seasons []Season `json:"seasons"`
// }
// type Season struct {
// Number int `json:"number"`
// AvgRating float64 `json:"avg_rating"`
// MedianRating float64 `json:"median_rating"`
// Votes int `json:"votes"`
// Episodes []Episode `json:"episodes"`
// }
// type Episode struct {
// Number int `json:"number"`
// EpisodeID string `json:"episode_id"`
// Title string `json:"title"`
// Aired time.Time `json:"aired"`
// AvgRating float64 `json:"avg_rating"`
// Votes int `json:"votes"`
// }
// func (tvShow *TvShow) TvShowBuilder(tvShowDTO TvShowDTO) {
// tvShow.ShowID = tvShowDTO.ShowID
// tvShow.Title = tvShowDTO.Title
// tvShow.Runtime, _ = strconv.Atoi(tvShowDTO.Runtime)
// lastSeasonNumber := tvShowDTO.Episodes[len(tvShowDTO.Episodes)-1].SeasonID
// if lastSeasonNumber == -1 {
// lastSeasonNumber = tvShowDTO.Episodes[len(tvShowDTO.Episodes)-2].SeasonID
// }
// seasons := make([]Season, lastSeasonNumber)
// for currentSeason := 1; currentSeason <= lastSeasonNumber; currentSeason++ {
// for _, episode := range tvShowDTO.Episodes {
// if episode.SeasonID == currentSeason {
// seasons[currentSeason-1].Number = currentSeason
// seasons[currentSeason-1].Episodes = append(seasons[currentSeason-1].Episodes, Episode{
// Number: episode.Number,
// EpisodeID: episode.EpisodeID,
// Title: episode.Title,
// Aired: episode.Aired.Time,
// AvgRating: episode.AvgRating,
// Votes: episode.Votes,
// })
// }
// }
// }
// tvShow.Seasons = seasons
// }
+52
View File
@@ -0,0 +1,52 @@
package models
// type TvShowDTO struct {
// ShowID string `json:"tt_show_id"`
// Title string `json:"title"`
// Runtime string `json:"runtime"`
// Episodes []EpisodeDTO `json:"episodes"`
// }
// type EpisodeDTO struct {
// Number int `json:"number"`
// SeasonID int `json:"season_id"`
// EpisodeID string `json:"tt_episode_id"`
// Title string `json:"title"`
// Aired AiredTime `json:"aired"`
// AvgRating float64 `json:"avg_rating"`
// Votes int `json:"votes"`
// }
// type AiredTime struct {
// time.Time
// }
// func (tvShow *TvShow) UnmarshalJSON(data []byte) error {
// var tvShowDTO TvShowDTO
// err := json.Unmarshal(data, &tvShowDTO)
// if err != nil {
// return err
// }
// tvShow.TvShowBuilder(tvShowDTO)
// return nil
// }
// func (aired *AiredTime) UnmarshalJSON(data []byte) error {
// if string(data) == "null" || string(data) == "" {
// return nil
// }
// var s string
// if err := json.Unmarshal(data, &s); err != nil {
// return nil
// }
// t, err := utils.TimeParser(s)
// if err != nil {
// return err
// }
// aired.Time = t
// return nil
// }
+40
View File
@@ -0,0 +1,40 @@
package repository
import (
"context"
"log/slog"
"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgxpool"
"github.com/zepyrshut/rating-orama/internal/sqlc"
)
// pgxRepository is the implementation returned by NewPGXRepo. It embeds the
// sqlc-generated *sqlc.Queries, so all generated query methods are promoted
// onto it, and keeps the pool for starting transactions in execTx.
type pgxRepository struct {
	*sqlc.Queries
	// db is the connection pool execTx begins transactions on.
	db *pgxpool.Pool
}
// NewPGXRepo wires a pgxRepository around db: the generated queries are
// created on the pool and the pool itself is retained for transactions. The
// repository is returned as an ExtendedQuerier.
func NewPGXRepo(db *pgxpool.Pool) ExtendedQuerier {
	repo := new(pgxRepository)
	repo.db = db
	repo.Queries = sqlc.New(db)
	return repo
}
// execTx runs txFunc inside a transaction started on the repository's pool.
//
// An error from Begin or from txFunc is logged and returned unchanged. When
// txFunc succeeds the transaction is committed and the result of Commit is
// returned. A rollback is always deferred and its result is discarded.
func (r *pgxRepository) execTx(ctx context.Context, txFunc func(tx pgx.Tx) error) error {
	slog.Info("starting transaction", "txFunc", txFunc)

	tx, beginErr := r.db.Begin(ctx)
	if beginErr != nil {
		slog.Error("failed to start transaction", "error", beginErr)
		return beginErr
	}
	// NOTE(review): presumably harmless once Commit has succeeded — confirm
	// against the pgx documentation.
	defer tx.Rollback(ctx)

	runErr := txFunc(tx)
	if runErr != nil {
		slog.Error("failed to execute transaction", "error", runErr)
		return runErr
	}

	slog.Info("committing transaction", "txFunc", txFunc)
	commitErr := tx.Commit(ctx)
	return commitErr
}
+9
View File
@@ -0,0 +1,9 @@
package repository
import (
"github.com/zepyrshut/rating-orama/internal/sqlc"
)
// ExtendedQuerier is the repository interface handed to the handlers. It
// currently embeds sqlc.Querier and adds no methods of its own.
type ExtendedQuerier interface {
	sqlc.Querier
}
+129
View File
@@ -0,0 +1,129 @@
package scraper
import (
"fmt"
"log/slog"
"regexp"
"sort"
"strconv"
"strings"
"time"
"github.com/gocolly/colly"
)
// Episode is one episode row as parsed by extractEpisodesFromSeason from the
// text of an IMDb season page.
type Episode struct {
	// Season and Episode are the numbers taken from the "S<n>.E<m>" marker.
	Season int
	Episode int
	// Released is the parsed release date; it stays the zero time when the
	// date text cannot be parsed.
	Released time.Time
	Name string
	Plot string
	// Rate is the numeric part of the "<rate>/10" rating.
	Rate float64
	// VoteCount is the number of votes shown next to the rating.
	VoteCount int
}
// Season is the list of episodes extracted from one season page.
type Season []Episode
// seasonsSelector selects the season tab links; each link's text is expected
// to be a season number.
const seasonsSelector = "ul.ipc-tabs a[data-testid='tab-season-entry']"
// episodesSelector selects the section whose text is fed to
// extractEpisodesFromSeason.
// NOTE(review): the class names look build-generated and will presumably
// change whenever IMDb redeploys its front end — confirm and consider a more
// stable selector.
const episodesSelector = "section.sc-1e7f96be-0.ZaQIL"
// nextSeasonButtonSelector selects the element with id "next-season-btn".
const nextSeasonButtonSelector = "#next-season-btn"
// imdbEpisodesURL is a format string taking the IMDb title ID (%s) and the
// season number (%d).
const imdbEpisodesURL = "https://www.imdb.com/title/%s/episodes?season=%d"
// scrapeSeasons crawls the IMDb episode listing of the title identified by
// ttImdb (an IMDb title ID such as "tt0903747").
//
// It first loads the title's episode overview page and collects every season
// number shown in the season tabs. Once that page has been scraped, each
// distinct season page is visited in ascending order, its text is parsed with
// extractEpisodesFromSeason, and the episodes are accumulated in memory.
// Persisting the result is still a TODO, so nothing is returned. Failed page
// visits are logged and do not abort the remaining visits.
func scrapeSeasons(ttImdb string) {
	// Overview page of a title; the season tabs are read from it.
	const overviewURL = "https://www.imdb.com/title/%s/episodes"

	c := colly.NewCollector(
		colly.AllowedDomains("imdb.com", "www.imdb.com"),
	)

	var allEpisodes []Episode
	var seasons []int

	c.OnHTML(seasonsSelector, func(e *colly.HTMLElement) {
		seasonNum, err := strconv.Atoi(strings.TrimSpace(e.Text))
		if err != nil {
			// Tabs whose text is not a plain number are skipped.
			return
		}
		seasons = append(seasons, seasonNum)
	})

	c.OnScraped(func(r *colly.Response) {
		// The same tab can be matched more than once, so de-duplicate
		// before visiting.
		seen := make(map[int]bool, len(seasons))
		uniqueSeasons := make([]int, 0, len(seasons))
		for _, seasonNum := range seasons {
			if seen[seasonNum] {
				continue
			}
			seen[seasonNum] = true
			uniqueSeasons = append(uniqueSeasons, seasonNum)
		}
		sort.Ints(uniqueSeasons)

		// A clone is used so the season-tab and OnScraped callbacks
		// registered on c are not triggered again by the season pages.
		episodeCollector := c.Clone()
		episodeCollector.OnHTML(episodesSelector, func(e *colly.HTMLElement) {
			allEpisodes = append(allEpisodes, extractEpisodesFromSeason(e.Text)...)
		})

		for _, seasonNum := range uniqueSeasons {
			seasonURL := fmt.Sprintf(imdbEpisodesURL, ttImdb, seasonNum)
			// slog takes key/value attributes, not printf verbs.
			slog.Info("visiting season page", "url", seasonURL)
			if err := episodeCollector.Visit(seasonURL); err != nil {
				slog.Error("visiting season page", "url", seasonURL, "error", err)
			}
		}
		episodeCollector.Wait()

		// fmt.Println("Total episodes:", len(allEpisodes))
		// for _, episode := range allEpisodes {
		// 	fmt.Printf("Season %d, Episode %d: %s\n", episode.Season, episode.Episode, episode.Name)
		// }

		// TODO: Save to DB
	})

	// Start from the requested title instead of a hard-coded one.
	startURL := fmt.Sprintf(overviewURL, ttImdb)
	if err := c.Visit(startURL); err != nil {
		slog.Error("visiting episode overview", "url", startURL, "error", err)
	}
	c.Wait()
}
// episodeRowPattern matches one episode entry in the flattened text of an
// IMDb season page. It is compiled once at package scope. Capture groups:
//
//	1: season/episode marker, e.g. "S1.E1"
//	2: episode name
//	3: weekday abbreviation of the release date
//	4: month and day, e.g. "Jan 20"
//	5: four-digit year
//	6: plot
//	7: rating, e.g. "9.0/10"
//	8: vote count, e.g. "853", "45K", "1.2K" or "2.1M"
var episodeRowPattern = regexp.MustCompile(
	`(S\d+\.E\d+)\s∙\s(.*?)` +
		`(Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s` +
		`(.*?),\s(\d{4})(.*?)` +
		`(\d\.\d{1,2}/10) \((\d+(?:\.\d+)?[KM]?)\)Rate`)

// parseVoteCount converts an abbreviated vote count into an absolute number.
// A trailing "K" multiplies by 1,000 and a trailing "M" by 1,000,000; the
// numeric part may carry a decimal fraction ("1.2K" yields 1200).
func parseVoteCount(s string) (int, error) {
	multiplier := 1.0
	switch {
	case strings.HasSuffix(s, "K"):
		multiplier, s = 1_000, strings.TrimSuffix(s, "K")
	case strings.HasSuffix(s, "M"):
		multiplier, s = 1_000_000, strings.TrimSuffix(s, "M")
	}

	n, err := strconv.ParseFloat(s, 64)
	if err != nil {
		return 0, err
	}
	// Round to the nearest integer so float error cannot drop a vote.
	return int(n*multiplier + 0.5), nil
}

// extractEpisodesFromSeason parses the text content of a season page and
// returns one Episode per row matched by episodeRowPattern, in page order.
//
// Text that contains no episode rows yields an empty, non-nil Season. A field
// that cannot be converted keeps its zero value and a warning is logged; the
// episode is still returned.
func extractEpisodesFromSeason(data string) Season {
	matches := episodeRowPattern.FindAllStringSubmatch(data, -1)
	episodes := make([]Episode, 0, len(matches))

	for _, match := range matches {
		marker := match[1]
		warn := func(field, raw string, err error) {
			slog.Warn("parsing episode field", "episode", marker, "field", field, "value", raw, "error", err)
		}

		episode := Episode{
			Name: strings.TrimSpace(match[2]),
			Plot: strings.TrimSpace(match[6]),
		}

		// The marker has the form "S<season>.E<episode>".
		seasonPart, episodePart, _ := strings.Cut(marker, ".")
		seasonNum := strings.TrimPrefix(seasonPart, "S")
		episodeNum := strings.TrimPrefix(episodePart, "E")

		var err error
		if episode.Season, err = strconv.Atoi(seasonNum); err != nil {
			warn("season", seasonNum, err)
		}
		if episode.Episode, err = strconv.Atoi(episodeNum); err != nil {
			warn("episode", episodeNum, err)
		}

		released := match[3] + ", " + strings.TrimSpace(match[4]) + ", " + match[5]
		if episode.Released, err = time.Parse("Mon, Jan 2, 2006", released); err != nil {
			warn("released", released, err)
		}

		rate := strings.TrimSuffix(match[7], "/10")
		if episode.Rate, err = strconv.ParseFloat(rate, 64); err != nil {
			warn("rate", rate, err)
		}

		if episode.VoteCount, err = parseVoteCount(match[8]); err != nil {
			warn("votes", match[8], err)
		}

		episodes = append(episodes, episode)
	}

	return episodes
}
+32
View File
@@ -0,0 +1,32 @@
// Code generated by sqlc. DO NOT EDIT.
// versions:
// sqlc v1.27.0
package sqlc
import (
"context"
"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgconn"
)
// DBTX is the set of methods Queries needs from its underlying connection.
// New accepts any DBTX, and WithTx stores a pgx.Tx in the same field.
type DBTX interface {
	Exec(context.Context, string, ...interface{}) (pgconn.CommandTag, error)
	Query(context.Context, string, ...interface{}) (pgx.Rows, error)
	QueryRow(context.Context, string, ...interface{}) pgx.Row
}
// New returns a Queries whose statements are executed through db.
func New(db DBTX) *Queries {
	return &Queries{db: db}
}
// Queries is the receiver of the generated query methods.
type Queries struct {
	// db is the connection or transaction the statements run on.
	db DBTX
}
// WithTx returns a new Queries whose statements run on tx. The receiver is
// not modified.
func (q *Queries) WithTx(tx pgx.Tx) *Queries {
	return &Queries{
		db: tx,
	}
}
+30
View File
@@ -0,0 +1,30 @@
// Code generated by sqlc. DO NOT EDIT.
// versions:
// sqlc v1.27.0
package sqlc
import (
"github.com/jackc/pgx/v5/pgtype"
)
// Episode holds one row of the "episodes" table; the fields follow the
// column order used by the generated queries.
type Episode struct {
	ID int32 `json:"id"`
	// TvShowID is the value the queries filter on as tv_show_id.
	TvShowID int32 `json:"tv_show_id"`
	Season int32 `json:"season"`
	Episode int32 `json:"episode"`
	Released pgtype.Date `json:"released"`
	Name string `json:"name"`
	Plot string `json:"plot"`
	AvgRating pgtype.Numeric `json:"avg_rating"`
	VoteCount int32 `json:"vote_count"`
}
// TvShow holds one row of the "tv_show" table; the fields follow the column
// order used by the generated queries.
type TvShow struct {
	ID int32 `json:"id"`
	Name string `json:"name"`
	// TtImdb is the column CheckTVShowExists looks a show up by.
	TtImdb string `json:"tt_imdb"`
	// Popularity is the counter incremented by IncreasePopularity.
	Popularity int32 `json:"popularity"`
	CreatedAt pgtype.Timestamp `json:"created_at"`
	UpdatedAt pgtype.Timestamp `json:"updated_at"`
}
+22
View File
@@ -0,0 +1,22 @@
// Code generated by sqlc. DO NOT EDIT.
// versions:
// sqlc v1.27.0
package sqlc
import (
"context"
)
// Querier lists the generated query methods implemented by *Queries.
type Querier interface {
	CheckTVShowExists(ctx context.Context, ttImdb string) (TvShow, error)
	CreateEpisodes(ctx context.Context, arg CreateEpisodesParams) (Episode, error)
	CreateTVShow(ctx context.Context, arg CreateTVShowParams) (TvShow, error)
	GetEpisodes(ctx context.Context, tvShowID int32) ([]Episode, error)
	IncreasePopularity(ctx context.Context, id int32) error
	SeasonAverageRating(ctx context.Context, arg SeasonAverageRatingParams) (float64, error)
	TvShowAverageRating(ctx context.Context, tvShowID int32) (float64, error)
	TvShowMedianRating(ctx context.Context, tvShowID int32) (float64, error)
}
// Compile-time check that *Queries satisfies Querier.
var _ Querier = (*Queries)(nil)
+185
View File
@@ -0,0 +1,185 @@
// Code generated by sqlc. DO NOT EDIT.
// versions:
// sqlc v1.27.0
// source: tv_show.sql
package sqlc
import (
"context"
"github.com/jackc/pgx/v5/pgtype"
)
const checkTVShowExists = `-- name: CheckTVShowExists :one
select id, name, tt_imdb, popularity, created_at, updated_at from "tv_show"
where tt_imdb = $1
`
// CheckTVShowExists selects the "tv_show" row whose tt_imdb equals ttImdb.
// Despite its name it returns the full row rather than a boolean; the error
// from Scan is returned unchanged.
// NOTE(review): presumably a "no rows" error signals that the show does not
// exist — confirm against the pgx documentation.
func (q *Queries) CheckTVShowExists(ctx context.Context, ttImdb string) (TvShow, error) {
	row := q.db.QueryRow(ctx, checkTVShowExists, ttImdb)
	var i TvShow
	err := row.Scan(
		&i.ID,
		&i.Name,
		&i.TtImdb,
		&i.Popularity,
		&i.CreatedAt,
		&i.UpdatedAt,
	)
	return i, err
}
const createEpisodes = `-- name: CreateEpisodes :one
insert into "episodes" (tv_show_id, season, episode, released, name, plot, avg_rating, vote_count)
values ($1, $2, $3, $4, $5, $6, $7, $8)
returning id, tv_show_id, season, episode, released, name, plot, avg_rating, vote_count
`
// CreateEpisodesParams carries the column values for CreateEpisodes, in the
// order of the statement's placeholders.
type CreateEpisodesParams struct {
	TvShowID int32 `json:"tv_show_id"`
	Season int32 `json:"season"`
	Episode int32 `json:"episode"`
	Released pgtype.Date `json:"released"`
	Name string `json:"name"`
	Plot string `json:"plot"`
	AvgRating pgtype.Numeric `json:"avg_rating"`
	VoteCount int32 `json:"vote_count"`
}
// CreateEpisodes inserts a single row into "episodes" (one episode per call,
// despite the plural name) and returns the inserted row as reported by the
// statement's RETURNING clause.
func (q *Queries) CreateEpisodes(ctx context.Context, arg CreateEpisodesParams) (Episode, error) {
	row := q.db.QueryRow(ctx, createEpisodes,
		arg.TvShowID,
		arg.Season,
		arg.Episode,
		arg.Released,
		arg.Name,
		arg.Plot,
		arg.AvgRating,
		arg.VoteCount,
	)
	var i Episode
	err := row.Scan(
		&i.ID,
		&i.TvShowID,
		&i.Season,
		&i.Episode,
		&i.Released,
		&i.Name,
		&i.Plot,
		&i.AvgRating,
		&i.VoteCount,
	)
	return i, err
}
const createTVShow = `-- name: CreateTVShow :one
insert into "tv_show" (name, tt_imdb)
values ($1, $2)
returning id, name, tt_imdb, popularity, created_at, updated_at
`
// CreateTVShowParams carries the name and tt_imdb values for CreateTVShow.
type CreateTVShowParams struct {
	Name string `json:"name"`
	TtImdb string `json:"tt_imdb"`
}
// CreateTVShow inserts a row into "tv_show" with the given name and tt_imdb
// and returns the inserted row. Columns not listed in the insert are filled
// in by the database.
func (q *Queries) CreateTVShow(ctx context.Context, arg CreateTVShowParams) (TvShow, error) {
	row := q.db.QueryRow(ctx, createTVShow, arg.Name, arg.TtImdb)
	var i TvShow
	err := row.Scan(
		&i.ID,
		&i.Name,
		&i.TtImdb,
		&i.Popularity,
		&i.CreatedAt,
		&i.UpdatedAt,
	)
	return i, err
}
const getEpisodes = `-- name: GetEpisodes :many
select id, tv_show_id, season, episode, released, name, plot, avg_rating, vote_count from "episodes"
where tv_show_id = $1
`
// GetEpisodes returns every "episodes" row whose tv_show_id equals tvShowID.
// The statement has no ORDER BY, so the row order is whatever the database
// returns. When no row matches, an empty non-nil slice is returned. Any
// query, scan or iteration error yields a nil slice and that error.
func (q *Queries) GetEpisodes(ctx context.Context, tvShowID int32) ([]Episode, error) {
	rows, err := q.db.Query(ctx, getEpisodes, tvShowID)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	items := []Episode{}
	for rows.Next() {
		var i Episode
		if err := rows.Scan(
			&i.ID,
			&i.TvShowID,
			&i.Season,
			&i.Episode,
			&i.Released,
			&i.Name,
			&i.Plot,
			&i.AvgRating,
			&i.VoteCount,
		); err != nil {
			return nil, err
		}
		items = append(items, i)
	}
	// Surface errors that ended the iteration early.
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return items, nil
}
const increasePopularity = `-- name: IncreasePopularity :exec
update "tv_show" set popularity = popularity + 1
where id = $1
`
// IncreasePopularity adds one to the popularity of the "tv_show" row with
// the given id. The command tag is discarded, so a call that matches no row
// is not reported as an error by this method.
func (q *Queries) IncreasePopularity(ctx context.Context, id int32) error {
	_, err := q.db.Exec(ctx, increasePopularity, id)
	return err
}
const seasonAverageRating = `-- name: SeasonAverageRating :one
select avg(avg_rating) from "episodes"
where tv_show_id = $1 and season = $2
`
// SeasonAverageRatingParams identifies the show and season to average.
type SeasonAverageRatingParams struct {
	TvShowID int32 `json:"tv_show_id"`
	Season int32 `json:"season"`
}
// SeasonAverageRating returns avg(avg_rating) over the episodes of one
// season of one show.
// NOTE(review): avg() over zero rows is presumably NULL, which would make
// the Scan into a float64 fail — confirm, and handle it in the SQL source if
// empty seasons are possible.
func (q *Queries) SeasonAverageRating(ctx context.Context, arg SeasonAverageRatingParams) (float64, error) {
	row := q.db.QueryRow(ctx, seasonAverageRating, arg.TvShowID, arg.Season)
	var avg float64
	err := row.Scan(&avg)
	return avg, err
}
const tvShowAverageRating = `-- name: TvShowAverageRating :one
select avg(avg_rating) from "episodes"
where tv_show_id = $1
`
// TvShowAverageRating returns avg(avg_rating) over all episodes of the show
// identified by tvShowID.
// NOTE(review): avg() over zero rows is presumably NULL, which would make
// the Scan into a float64 fail — confirm for shows without episodes.
func (q *Queries) TvShowAverageRating(ctx context.Context, tvShowID int32) (float64, error) {
	row := q.db.QueryRow(ctx, tvShowAverageRating, tvShowID)
	var avg float64
	err := row.Scan(&avg)
	return avg, err
}
const tvShowMedianRating = `-- name: TvShowMedianRating :one
select percentile_cont(0.5) within group (order by avg_rating) from "episodes"
where tv_show_id = $1
`
// TvShowMedianRating returns the 0.5 continuous percentile (the median) of
// avg_rating over all episodes of the show identified by tvShowID.
// NOTE(review): the aggregate is presumably NULL when the show has no
// episodes, which would make the Scan into a float64 fail — confirm.
func (q *Queries) TvShowMedianRating(ctx context.Context, tvShowID int32) (float64, error) {
	row := q.db.QueryRow(ctx, tvShowMedianRating, tvShowID)
	var percentile_cont float64
	err := row.Scan(&percentile_cont)
	return percentile_cont, err
}