From 2c054e0e4dbc94ecff95d62fd8a06b98405ff4d4 Mon Sep 17 00:00:00 2001 From: Evgeny Kuznetsov Date: Wed, 5 Feb 2020 18:48:03 +0300 Subject: parse regexes safely --- main.go | 38 +++++++++++++++++++++++++++++--------- main_test.go | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 9 deletions(-) diff --git a/main.go b/main.go index 9ffdafd..16463b1 100644 --- a/main.go +++ b/main.go @@ -113,12 +113,12 @@ func getFeed(url string) (feed *feeds.Feed) { } func populateFeed(feed *feeds.Feed, page []byte) (err error) { - titleMatch := programNameRe.FindSubmatch(page) - if len(titleMatch) < 1 { - return fmt.Errorf("bad programme page") + title, err := parseSingle(page, programNameRe) + if err != nil { + return fmt.Errorf("bad programme page: title not found") } + feed.Title = stripLink(string(title)) - feed.Title = stripLink(string(titleMatch[1])) programImage := programImageRe.FindSubmatch(page) feed.Image = &feeds.Image{ Link: feed.Link.Href, @@ -133,8 +133,10 @@ func populateFeed(feed *feeds.Feed, page []byte) (err error) { if len(episodeUrlRe.FindAllSubmatch(episode, -1)) > 1 { return errBadEpisode } - episodeUrl := urlPrefix + string(episodeUrlRe.FindSubmatch(episode)[1]) - episodeTitle := string(episodeTitleRe.FindSubmatch(episode)[1]) + url, _ := parseSingle(episode, episodeUrlRe) + episodeUrl := urlPrefix + string(url) + title, _ := parseSingle(episode, episodeTitleRe) + episodeTitle := string(title) enclosure := findEnclosure(episode) date := findDate(episode) @@ -149,6 +151,24 @@ func populateFeed(feed *feeds.Feed, page []byte) (err error) { return nil } +func parse(src []byte, re *regexp.Regexp, n int) (out [][]byte, err error) { + match := re.FindSubmatch(src) + if len(match) != n+1 { + for i := 0; i < n; i++ { + out = append(out, []byte{}) + } + return out, errCantParse + } + + return match[1:], nil +} + +func parseSingle(src []byte, re *regexp.Regexp) (out []byte, err error) { + got, err := parse(src, re, 1) + out = got[0] + return +} + func findDate(ep []byte) time.Time { episodeDateRe := regexp.MustCompile(`brand\-time brand\-menu\-link">(.+?)?\.(.+?)?\.(.+?)? в (.+?)?:(.+?)?`) dateBytes := episodeDateRe.FindSubmatch(ep) @@ -174,12 +194,12 @@ func parseDate(bytes [][]byte) time.Time { func findEnclosure(ep []byte) *feeds.Enclosure { re := regexp.MustCompile(`data\-type="audio"\s+data\-id="(.+?)?">`) - matches := re.FindSubmatch(ep) - if len(matches) < 2 { + res, err := parseSingle(ep, re) + if err != nil { return &feeds.Enclosure{} } - url := "https://audio.vgtrk.com/download?id=" + string(matches[1]) + url := "https://audio.vgtrk.com/download?id=" + string(res) return &feeds.Enclosure{ Url: url, diff --git a/main_test.go b/main_test.go index 73dc0c8..2d520cb 100644 --- a/main_test.go +++ b/main_test.go @@ -26,6 +26,7 @@ import ( "os" "os/exec" "path/filepath" + "regexp" "strings" "sync" "testing" @@ -301,3 +302,41 @@ func TestParseDate(t *testing.T) { } } } + +func TestParseErrors(t *testing.T) { + type testval struct { + src []byte + re *regexp.Regexp + n int + err error + } + + var tests = []testval{ + { + []byte("

Аэростат

"), + programNameRe, + 1, + nil, + }, { + []byte("

Аэростат

foo

"), + programNameRe, + 1, + nil, + }, { + []byte{}, + programNameRe, + 1, + errCantParse, + }, + } + + for _, test := range tests { + res, got := parse(test.src, test.re, test.n) + if test.err != got { + t.Error("for", test.src, test.re, test.n, "\nwant:", test.err, "got:", got) + } + if test.n != len(res) { + t.Error("for", test.src, test.re, test.n, "\nwant length:", test.n, "got:", len(res)) + } + } +} -- cgit v1.2.3