// Copyright (C) 2019-2020 Evgeny Kuznetsov (evgeny@kuznetsov.md)
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

// This program generates podcast RSS feeds from VGTRK radio programme
// pages (including pages that redirect to smotrim.ru).
package main

import (
	"bytes"
	"errors"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"net/url"
	"regexp"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/gorilla/feeds"
)

type subst struct {
	from string
	to   string
}

var (
	substitutes = []subst{
		// these need to be changed to show up properly in the feed
		{from: `&quot;`, to: `"`},
		{from: `&ndash;`, to: `–`},
	}

	// NOTE: the HTML markup inside the following patterns was lost in
	// transit; the tag structure used here is an assumption modelled on
	// the class names that survive elsewhere in this file. The capture
	// groups are preserved as they were.
	programNameRe = regexp.MustCompile(`
<h1 class="brand-main-item__title">
(.+?)?
</h1>
`)
	programAboutRe = regexp.MustCompile(`(?s)<div class="brand-about">
(.+?)?
</div>`)
	programImageRe = regexp.MustCompile(`(?s)<img(.+?)?src="(.+?)?"(.+?)?title="(.+?)?"`)
	episodeTitleRe = regexp.MustCompile(`title brand\-menu\-link">(.+?)?</a>`)
	// NOTE: reconstructed; the captured part is what follows the common
	// "/brand/" prefix (see episodeURLPrefix).
	episodeUrlRe = regexp.MustCompile(`href="/brand/(.+?)?"`)

	// NOTE: these declarations were lost in transit and are reconstructed
	// from the way they are used below.
	errBadEpisode = errors.New("bad episode")
	errCantParse  = errors.New("can't parse")

	// the site gives air dates in Moscow time (UTC+3)
	moscow = time.FixedZone("MSK", 3*60*60)
)

// populateEpisodes fills the feed with the episodes found on a programme
// page. NOTE: parts of this function were lost in transit; the sanity
// check on the number of links per episode block is a reconstruction.
func populateEpisodes(feed *feeds.Feed, page []byte) (err error) {
	urlPrefix := episodeURLPrefix(feed.Link.Href)
	episodes := findEpisodes(page)
	if len(episodes) == 0 {
		return errCantParse
	}
	for _, episode := range episodes {
		// an episode block with more than one episode link is malformed
		if len(episodeUrlRe.FindAll(episode, -1)) > 1 {
			return errBadEpisode
		}
		url, err := parseSingle(episode, episodeUrlRe)
		if err != nil {
			return errBadEpisode
		}
		episodeUrl := urlPrefix + string(url)
		title, _ := parseSingle(episode, episodeTitleRe)
		episodeTitle := string(title)
		enclosure := findEnclosure(episode)
		date := findDate(episode)
		feed.Add(&feeds.Item{
			Id:        episodeID(episodeUrl),
			Link:      &feeds.Link{Href: episodeUrl},
			Title:     episodeTitle,
			Enclosure: enclosure,
			Created:   date,
		})
	}
	return
}

func populateSmotrimEpisodes(feed *feeds.Feed, page []byte) (err error) {
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(page))
	if err != nil {
		return
	}
	base, err := url.Parse(feed.Link.Href)
	if err != nil {
		return
	}
	doc.Find(".episode-card").Each(func(i int, s *goquery.Selection) {
		l, _ := s.Find(".episode-card__link").Attr("href")
		id := strings.TrimPrefix(l, "/audio/")
		link, err := base.Parse(l)
		if err != nil {
			return
		}
		title := strings.TrimSpace(strings.TrimPrefix(s.Find(".episode-card__title").Text(), s.Find(".episode-card__title__brand").Text()))
		feed.Add(&feeds.Item{
			Id:        id,
			Link:      &feeds.Link{Href: link.String()},
			Title:     title,
			Enclosure: enclosure(id),
		})
	})
	return
}

// parseSite returns the hostname of the site the feed is built from.
func parseSite(feed *feeds.Feed) string {
	u, err := url.Parse(feed.Link.Href)
	if err != nil {
		return ""
	}
	return u.Hostname()
}

func parseProgrammeTitle(page []byte) (title string, err error) {
	t, err := parseSingle(page, programNameRe)
	if err != nil {
		return
	}
	title = stripLink(string(t))
	return
}

func parseText(page []byte, sel string) (title string, err error) {
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(page))
	if err != nil {
		return
	}
	title = strings.TrimSpace(doc.Find(sel).Text())
	return
}

func addFeedImage(page []byte, feed *feeds.Feed) {
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(page))
	if err != nil {
		return
	}
	img := doc.Find(".brand-main-item__picture").Find("img")
	if src, ok := img.Attr("src"); ok {
		t, _ := img.Attr("title")
		feed.Image = &feeds.Image{
			Link:  feed.Link.Href,
			Url:   src,
			Title: t,
		}
		return
	}
	programImage, err := parse(page, programImageRe, 4)
	if err == nil {
		feed.Image = &feeds.Image{
			Link:  feed.Link.Href,
			Url:   string(programImage[1]),
			Title: string(programImage[3]),
		}
	}
}

// parse matches src against re and returns the n submatches, or n empty
// slices and errCantParse if the match fails.
func parse(src []byte, re *regexp.Regexp, n int) (out [][]byte, err error) {
	match := re.FindSubmatch(src)
	if len(match) != n+1 {
		for i := 0; i < n; i++ {
			out = append(out, []byte{})
		}
		return out, errCantParse
	}
	return match[1:], nil
}

func parseSingle(src []byte, re *regexp.Regexp) (out []byte, err error) {
	got, err := parse(src, re, 1)
	out = got[0]
	return
}

func findDate(ep []byte) time.Time {
	episodeDateRe := regexp.MustCompile(`brand\-time brand\-menu\-link">(.+?)?\.(.+?)?\.(.+?)?
в (.+?)?:(.+?)?`)
	dateBytes := episodeDateRe.FindSubmatch(ep)
	return parseDate(dateBytes)
}

// parseDate turns the submatches (day, month, year, hour, minute) into a
// time.Time, falling back to the Unix epoch if anything fails to parse.
func parseDate(bytes [][]byte) time.Time {
	if len(bytes) < 4 {
		return time.Date(1970, time.January, 1, 0, 0, 0, 0, moscow)
	}
	var date [5]int
	for i, b := range bytes[1:] {
		d, err := strconv.Atoi(string(b))
		if err != nil {
			return time.Date(1970, time.January, 1, 0, 0, 0, 0, moscow)
		}
		date[i] = d
	}
	return time.Date(date[2], time.Month(date[1]), date[0], date[3], date[4], 0, 0, moscow)
}

func findEnclosure(ep []byte) *feeds.Enclosure {
	re := regexp.MustCompile(`data\-type="audio"\s+data\-id="(.+?)?">`)
	res, err := parseSingle(ep, re)
	if err != nil {
		return &feeds.Enclosure{}
	}
	return enclosure(string(res))
}

// enclosure builds the audio enclosure for an episode with the given id.
func enclosure(no string) *feeds.Enclosure {
	url := "https://audio.vgtrk.com/download?id=" + no
	return &feeds.Enclosure{
		Url:    url,
		Length: "1024",
		Type:   "audio/mpeg",
	}
}

func findEpisodes(page []byte) [][]byte {
	// NOTE: the markup around this pattern was lost in transit; the <div>
	// wrapper is an assumption, the surviving parts are kept verbatim.
	episodeRe := regexp.MustCompile(`(?s)
<div class="brand-menu-item">(.+?)?data-id="(.+?)">
</div>`)
	episodes := episodeRe.FindAll(page, -1)
	return episodes
}

func describeFeed(feed *feeds.Feed, wg *sync.WaitGroup) {
	defer wg.Done()
	url := strings.TrimSuffix(feed.Link.Href, "episodes") + "about"
	page, _ := getPage(url)
	desc, err := processFeedDesc(page)
	if err != nil {
		log.Printf("could not find programme description on page %v: %v", url, err)
	}
	feed.Description = desc
}

// processFeedDesc extracts the programme description and strips any
// HTML tags from it.
func processFeedDesc(page []byte) (string, error) {
	res, err := parseSingle(page, programAboutRe)
	if err != nil {
		return "", err
	}
	re := regexp.MustCompile(`<(.+?)?>`)
	return string(re.ReplaceAll(res, []byte(``))), err
}

func describeEpisodes(feed *feeds.Feed) {
	var wg sync.WaitGroup
	for _, item := range feed.Items {
		wg.Add(1)
		go describeEpisode(item, &wg)
	}
	wg.Wait()
}

func describeEpisode(item *feeds.Item, wg *sync.WaitGroup) {
	defer wg.Done()
	page, _ := getPage(item.Link.Href)
	desc, err := processEpisodeDesc(page)
	if err != nil {
		log.Printf("could not find episode description on page %v: %v", item.Link.Href, err)
	}
	item.Description = desc
	if item.Created.IsZero() {
		item.Created = parseSmotrimDate(page)
	}
}

// parseSmotrimDate parses dates like "2 января 2020, 15:04" (Moscow time)
// from a smotrim.ru episode page.
func parseSmotrimDate(page []byte) (t time.Time) {
	s, err := parseText(page, ".video__date")
	if err != nil {
		return
	}
	mnths := [12]string{"января", "февраля", "марта", "апреля", "мая", "июня", "июля", "августа", "сентября", "октября", "ноября", "декабря"}
	for i, mnt := range mnths {
		s = strings.ReplaceAll(s, mnt, strconv.Itoa(i+1))
	}
	s = fmt.Sprintf("%s z+03", s)
	t, _ = time.Parse("2 1 2006, 15:04 z-07", s)
	return
}

func processEpisodeDesc(page []byte) (string, error) {
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(page))
	if err != nil {
		return "", err
	}
	var r []string
	r = addText(r, doc.Find(".brand-episode__head").Find(".anons").Text())
	r = addText(r, doc.Find(".brand-episode__body").Find(".body").Text())
	r = addText(r, strings.TrimSpace(doc.Find(".video__body").Text()))
	res := strings.Join(r, "\n\n")
	if res == "" {
		return "", errCantParse
	}
	return res, err
}

func addText(arr []string, str string) []string {
	if str != "" {
		arr = append(arr, str)
	}
	return arr
}

// getPage fetches a page and returns its cleaned-up body along with the
// final URL (after any redirects).
func getPage(pageUrl string) ([]byte, string) {
	client := &http.Client{}
	req, err := http.NewRequest("GET", pageUrl, nil)
	if err != nil {
		log.Fatal(err)
	}
	req.Header.Add("User-Agent", `Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.27 Safari/537.36`)
	res, err := client.Do(req)
	if err != nil {
		log.Fatal(err)
	}
	defer res.Body.Close()
	page, err := ioutil.ReadAll(res.Body)
	if err != nil {
		log.Fatal(err)
	}
	page = cleanText(page)
	return page, res.Request.URL.String()
}

// cleanText replaces HTML-encoded symbols with proper UTF-8 characters
func cleanText(b []byte) []byte {
	for _, sub := range substitutes {
		re := regexp.MustCompile(sub.from)
		b = re.ReplaceAll(b, []byte(sub.to))
	}
	return b
}

// episodeURLPrefix derives the common episode URL prefix from the programme page URL
func episodeURLPrefix(url string) string {
	return strings.Split(url, "/brand/")[0] + "/brand/"
}

// episodeID generates an episode ID from the episode URL, changing
// "https://" to "http://" for backward compatibility
func episodeID(url string) string {
	if strings.HasPrefix(url, "https://") {
		return "http://" + strings.TrimPrefix(url, "https://")
	}
	return url
}

// stripLink strips a string of <a> tags
func stripLink(s string) string {
	// NOTE: the original pattern was lost in transit; this assumed pattern
	// removes opening and closing anchor tags.
	re := regexp.MustCompile(`</?a[^>]*>`)
	return re.ReplaceAllString(s, "")
}
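
// The original main function was lost in transit. The sketch below shows
// how the pieces above fit together, under stated assumptions: a single
// -url flag naming the programme page, RSS written to stdout, and the
// ".brand-main-item__title" selector for smotrim.ru titles. The real flag
// set, selector, and output handling may have differed.
func main() {
	programmeUrl := flag.String("url", "", "programme episodes page URL")
	flag.Parse()

	// fetch the page once; getPage reports the post-redirect URL, so a
	// radiorus.ru address that now redirects to smotrim.ru is detected
	page, resolved := getPage(*programmeUrl)
	feed := &feeds.Feed{Link: &feeds.Link{Href: resolved}}

	var wg sync.WaitGroup
	switch parseSite(feed) {
	case "smotrim.ru":
		feed.Title, _ = parseText(page, ".brand-main-item__title")
		if err := populateSmotrimEpisodes(feed, page); err != nil {
			log.Fatal(err)
		}
	default:
		feed.Title, _ = parseProgrammeTitle(page)
		if err := populateEpisodes(feed, page); err != nil {
			log.Fatal(err)
		}
		// the programme description lives on a separate "about" page,
		// so fetch it concurrently with the episode descriptions
		wg.Add(1)
		go describeFeed(feed, &wg)
	}
	addFeedImage(page, feed)
	describeEpisodes(feed)
	wg.Wait()

	rss, err := feed.ToRss()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(rss)
}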