bids: get a vote:comment ratio for scoring

* Ratio defaults to 1.0 on error
* Ratio bonus if a 0 comment article wins
* Remove buggy tests
* Add example scraping util for debugging
This commit is contained in:
Chris Sexton 2019-11-14 10:09:54 -05:00 committed by Chris Sexton
parent 30cd91079c
commit 905da629b9
5 changed files with 127 additions and 101 deletions

10
go.mod
View File

@ -9,6 +9,9 @@ require (
github.com/PaulRosset/go-hacknews v0.0.0-20170815075127-4aad99273a3c
github.com/PuerkitoBio/goquery v1.5.0
github.com/andybalholm/cascadia v1.1.0 // indirect
github.com/antchfx/htmlquery v1.2.0 // indirect
github.com/antchfx/xmlquery v1.2.0 // indirect
github.com/antchfx/xpath v1.1.1 // indirect
github.com/armon/go-radix v1.0.0 // indirect
github.com/azr/backoff v0.0.0-20160115115103-53511d3c7330 // indirect
github.com/chrissexton/gofuck v1.0.0
@ -17,12 +20,16 @@ require (
github.com/dustin/go-jsonpointer v0.0.0-20160814072949-ba0abeacc3dc // indirect
github.com/dustin/gojson v0.0.0-20160307161227-2e71ec9dd5ad // indirect
github.com/garyburd/go-oauth v0.0.0-20180319155456-bca2e7f09a17 // indirect
github.com/gobwas/glob v0.2.3 // indirect
github.com/gocolly/colly v1.2.0
github.com/golang/groupcache v0.0.0-20191027212112-611e8accdfc9 // indirect
github.com/gonum/floats v0.0.0-20181209220543-c233463c7e82 // indirect
github.com/gonum/internal v0.0.0-20181124074243-f884aa714029 // indirect
github.com/gorilla/websocket v1.4.1 // indirect
github.com/james-bowman/nlp v0.0.0-20191016091239-d9dbfaff30c6
github.com/james-bowman/sparse v0.0.0-20190423065201-80c6877364c7 // indirect
github.com/jmoiron/sqlx v1.2.0
github.com/kennygrant/sanitize v1.2.4 // indirect
github.com/mattn/go-sqlite3 v1.11.0
github.com/mmcdole/gofeed v1.0.0-beta2
github.com/mmcdole/goxpp v0.0.0-20181012175147-0068e33feabf // indirect
@ -30,14 +37,17 @@ require (
github.com/olebedev/when v0.0.0-20190311101825-c3b538a97254
github.com/robertkrimen/otto v0.0.0-20180617131154-15f95af6e78d // indirect
github.com/rs/zerolog v1.15.0
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
github.com/spaolacci/murmur3 v1.1.0 // indirect
github.com/stretchr/objx v0.2.0 // indirect
github.com/stretchr/testify v1.4.0
github.com/temoto/robotstxt v1.1.1 // indirect
github.com/velour/chat v0.0.0-20180713122344-fd1d1606cb89
github.com/velour/velour v0.0.0-20160303155839-8e090e68d158
golang.org/x/exp v0.0.0-20191014171548-69215a2ee97e // indirect
golang.org/x/net v0.0.0-20191014212845-da9a3fd4c582 // indirect
gonum.org/v1/gonum v0.6.0 // indirect
google.golang.org/appengine v1.6.5 // indirect
gopkg.in/go-playground/webhooks.v5 v5.13.0
gopkg.in/sourcemap.v1 v1.0.5 // indirect
gopkg.in/yaml.v2 v2.2.4 // indirect

22
go.sum
View File

@ -17,6 +17,12 @@ github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRy
github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/antchfx/htmlquery v1.2.0 h1:oKShnsGlnOHX6t4uj5OHgLKkABcJoqnXpqnscoi9Lpw=
github.com/antchfx/htmlquery v1.2.0/go.mod h1:MS9yksVSQXls00iXkiMqXr0J+umL/AmxXKuP28SUJM8=
github.com/antchfx/xmlquery v1.2.0 h1:1nrzsSN5mFrlqFWSK9byiq/qXKE7O2vivYzhv1Ksnfw=
github.com/antchfx/xmlquery v1.2.0/go.mod h1:/+CnyD/DzHRnv2eRxrVbieRU/FIF6N0C+7oTtyUtCKk=
github.com/antchfx/xpath v1.1.1 h1:mqGYmd5pioPu06+REIf8j3y6O3S1UpVNVoCameZHotg=
github.com/antchfx/xpath v1.1.1/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
github.com/armon/go-radix v1.0.0 h1:F4z6KzEeeQIMeLFa97iZU6vupzoecKdU5TX24SNppXI=
github.com/armon/go-radix v1.0.0/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8=
github.com/azr/backoff v0.0.0-20160115115103-53511d3c7330 h1:ekDALXAVvY/Ub1UtNta3inKQwZ/jMB/zpOtD8rAYh78=
@ -44,7 +50,15 @@ github.com/garyburd/go-oauth v0.0.0-20180319155456-bca2e7f09a17 h1:GOfMz6cRgTJ9j
github.com/garyburd/go-oauth v0.0.0-20180319155456-bca2e7f09a17/go.mod h1:HfkOCN6fkKKaPSAeNq/er3xObxTW4VLeY6UUK895gLQ=
github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU=
github.com/go-sql-driver/mysql v1.4.0/go.mod h1:zAC/RDZ24gD3HViQzih4MyKcchzm+sOG5ZlKdlhCg5w=
github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI=
github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA=
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
github.com/golang/groupcache v0.0.0-20191027212112-611e8accdfc9 h1:uHTyIjqVhYRhLbJ8nIiOJHkEZZ+5YoOsAbD3sk82NiE=
github.com/golang/groupcache v0.0.0-20191027212112-611e8accdfc9/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/golang/protobuf v1.3.1 h1:YF8+flBXS5eO826T4nzqPrxfhQThhXl0YzfuUPu4SBg=
github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/gonum/floats v0.0.0-20181209220543-c233463c7e82 h1:EvokxLQsaaQjcWVWSV38221VAK7qc2zhaO17bKys/18=
github.com/gonum/floats v0.0.0-20181209220543-c233463c7e82/go.mod h1:PxC8OnwL11+aosOB5+iEPoV3picfs8tUpkVd0pDo+Kg=
github.com/gonum/internal v0.0.0-20181124074243-f884aa714029 h1:8jtTdc+Nfj9AR+0soOeia9UZSvYBvETVHZrugUowJ7M=
@ -60,6 +74,8 @@ github.com/james-bowman/sparse v0.0.0-20190423065201-80c6877364c7/go.mod h1:G6Ec
github.com/jmoiron/sqlx v1.2.0 h1:41Ip0zITnmWNR/vHV+S4m+VoUivnWY5E4OJfLZjCJMA=
github.com/jmoiron/sqlx v1.2.0/go.mod h1:1FEQNm3xlJgrMD+FBdI9+xvCksHtbpVBBw5dYhBSsks=
github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes=
github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
github.com/lib/pq v1.0.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
github.com/mattn/go-sqlite3 v1.9.0/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc=
github.com/mattn/go-sqlite3 v1.11.0 h1:LDdKkqtYlom37fkvqs8rMPFKAMe8+SgjbwZ6ex1/A/Q=
@ -82,6 +98,8 @@ github.com/robertkrimen/otto v0.0.0-20180617131154-15f95af6e78d/go.mod h1:xvqspo
github.com/rs/xid v1.2.1/go.mod h1:+uKXf+4Djp6Md1KODXJxgGQPKngRmWyn10oCKFzNHOQ=
github.com/rs/zerolog v1.15.0 h1:uPRuwkWF4J6fGsJ2R0Gn2jB1EQiav9k3S6CSdygQJXY=
github.com/rs/zerolog v1.15.0/go.mod h1:xYTKnLHcpfU2225ny5qZjxnj9NvkumZYjJHlAThCjNc=
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca h1:NugYot0LIVPxTvN8n+Kvkn6TrbMyxQiuvKdEwFdR9vI=
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI=
github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA=
github.com/stretchr/objx v0.1.0 h1:4G4v2dO3VZwixGIRoQ5Lfboy6nUhCyYzaqnIAPPhYs4=
@ -92,6 +110,8 @@ github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXf
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/temoto/robotstxt v1.1.1 h1:Gh8RCs8ouX3hRSxxK7B1mO5RFByQ4CmJZDwgom++JaA=
github.com/temoto/robotstxt v1.1.1/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
github.com/velour/chat v0.0.0-20180713122344-fd1d1606cb89 h1:3D3M900hEBJJAqyKl70QuRHi5weX9+ptlQI1v+FNcQ8=
github.com/velour/chat v0.0.0-20180713122344-fd1d1606cb89/go.mod h1:ejwOYCjnDMyO5LXFXRARQJGBZ6xQJZ3rgAHE5drSuMM=
github.com/velour/velour v0.0.0-20160303155839-8e090e68d158 h1:p3rTUXxzuKsBOsHlkly7+rj9wagFBKeIsCDKkDII9sw=
@ -115,6 +135,7 @@ golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73r
golang.org/x/net v0.0.0-20190311183353-d8887717615a h1:oWX7TPOiFAMXLq8o0ikBYfCJVlRHBcsciT5bXOrH628=
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859 h1:R/3boaszxrf1GEUWTVDzSKVwLmSJpwZ1yqXm8j0v2QI=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20191014212845-da9a3fd4c582 h1:p9xBe/w/OzkeYVKm234g55gMdD1nSIooTir5kV11kfA=
@ -137,6 +158,7 @@ gonum.org/v1/gonum v0.6.0 h1:DJy6UzXbahnGUf1ujUNkh/NEtK14qMo2nvlBPs4U5yw=
gonum.org/v1/gonum v0.6.0/go.mod h1:9mxDZsDKxgMAuccQkewq682L+0eCu4dCN2yonUJTCLU=
gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw=
gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc=
google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/go-playground/webhooks.v5 v5.13.0 h1:e9vtkQZK464+UdL3YjRox2yR8JSmh2094PUBMvdriFs=
gopkg.in/go-playground/webhooks.v5 v5.13.0/go.mod h1:LZbya/qLVdbqDR1aKrGuWV6qbia2zCYSR5dpom2SInQ=

View File

@ -3,11 +3,15 @@ package webshit
import (
"bytes"
"fmt"
"math"
"net/http"
"net/url"
"strconv"
"strings"
"time"
"github.com/gocolly/colly"
hacknews "github.com/PaulRosset/go-hacknews"
"github.com/PuerkitoBio/goquery"
"github.com/jmoiron/sqlx"
@ -184,14 +188,19 @@ func (w *Webshit) checkBids(bids []Bid, storyMap map[string]Story) []WeeklyResul
rec.WinningArticles = append(rec.WinningArticles, s)
totalWinning += float64(b.Bid)
} else {
rec.LosingArticles = append(rec.LosingArticles, Story{b.Title, b.URL})
rec.LosingArticles = append(rec.LosingArticles, Story{Title: b.Title, URL: b.URL})
}
total += float64(b.Bid)
wr[b.User] = rec
}
for _, b := range wins {
payout := float64(b.Bid) / totalWinning * total
score, comments, err := scrapeScoreAndComments(b.URL)
ratio := 1.0
if err != nil {
ratio = float64(score) / math.Max(float64(comments), 1.0)
}
payout := float64(b.Bid) / totalWinning * total * ratio
rec := wr[b.User]
rec.Won += int(payout)
rec.Score += int(payout)
@ -201,6 +210,41 @@ func (w *Webshit) checkBids(bids []Bid, storyMap map[string]Story) []WeeklyResul
return wrMapToSlice(wr)
}
// scrapeScoreAndComments fetches the page at url and extracts the story's
// score and comment count from its "td.subtext" row.
//
// It returns (score, comments, err). A value that is missing from the page, or
// whose first whitespace-separated token is not an integer, is reported as 0.
// err is non-nil when the visit could not be started or when colly reported a
// failure through its OnError callback.
func scrapeScoreAndComments(url string) (int, int, error) {
	// The collector is left in its default synchronous mode so that Visit only
	// returns after every callback has run. The earlier async + channel version
	// blocked forever when Visit failed before a request was scheduled, and
	// left a goroutine stuck whenever more than one completion callback fired.
	c := colly.NewCollector()

	// leadingInt parses the first whitespace-separated token of text
	// (e.g. "123 points" -> 123) and yields 0 when there is no token or the
	// token is not a number. Indexing Fields(...)[0] directly would panic on
	// empty text.
	leadingInt := func(text string) int {
		fields := strings.Fields(text)
		if len(fields) == 0 {
			return 0
		}
		// A non-numeric token deliberately counts as 0, as it did before.
		n, _ := strconv.Atoi(fields[0])
		return n
	}

	score := 0
	comments := 0
	var scrapeErr error

	c.OnHTML("td.subtext > span.score", func(r *colly.HTMLElement) {
		score = leadingInt(r.Text)
	})

	c.OnHTML("td.subtext > a[href*='item?id=']:last-of-type", func(r *colly.HTMLElement) {
		comments = leadingInt(r.Text)
	})

	c.OnError(func(r *colly.Response, e error) {
		// Log the error handed to the callback; the outer variable has not
		// been assigned yet at this point.
		log.Error().Err(e).Msgf("could not scrape %s", r.Request.URL)
		scrapeErr = e
	})

	if err := c.Visit(url); err != nil && scrapeErr == nil {
		// Visit failed without going through OnError, so surface its error.
		scrapeErr = err
	}

	return score, comments, scrapeErr
}
// GetHeadlines will return the current possible news headlines for bidding
func (w *Webshit) GetHeadlines() ([]Story, error) {
news := hacknews.Initializer{Story: w.config.HNFeed, NbPosts: w.config.HNLimit}

View File

@ -1,99 +0,0 @@
package webshit
import (
"github.com/jmoiron/sqlx"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
"github.com/stretchr/testify/assert"
"os"
"testing"
_ "github.com/mattn/go-sqlite3"
)
func init() {
log.Logger = log.Logger.Output(zerolog.ConsoleWriter{Out: os.Stderr})
}
// makeWS builds a Webshit on top of a shared in-memory SQLite database and
// asserts that the returned instance holds the handle it was given.
func makeWS(t *testing.T) *Webshit {
	// Mark this as a helper so failures are reported at the calling test.
	t.Helper()
	db := sqlx.MustOpen("sqlite3", "file::memory:?mode=memory&cache=shared")
	w := New(db)
	// testify takes (expected, actual); the reversed order mislabels the
	// values in failure output.
	assert.Equal(t, db, w.db)
	return w
}
// TestWebshit_GetWeekly checks that GetWeekly yields a non-nil publication
// value, a nil error and a non-empty weekly result.
func TestWebshit_GetWeekly(t *testing.T) {
	ws := makeWS(t)

	stories, published, getErr := ws.GetWeekly()
	t.Logf("Pub: %v", published)

	assert.NotNil(t, published)
	assert.Nil(t, getErr)
	assert.NotEmpty(t, stories)
}
// TestWebshit_GetHeadlines checks that GetHeadlines returns at least one
// headline and no error.
func TestWebshit_GetHeadlines(t *testing.T) {
	ws := makeWS(t)

	stories, fetchErr := ws.GetHeadlines()

	assert.Nil(t, fetchErr)
	assert.NotEmpty(t, stories)
}
// TestWebshit_getStoryByURL checks that looking up a known item URL succeeds
// and yields the expected story title.
func TestWebshit_getStoryByURL(t *testing.T) {
	w := makeWS(t)
	expected := "Developer Tropes: “Google Does It”"
	s, err := w.getStoryByURL("https://news.ycombinator.com/item?id=20432887")
	assert.Nil(t, err)
	// testify takes (expected, actual); the reversed order mislabels the
	// values in failure output.
	assert.Equal(t, expected, s.Title)
}
// TestWebshit_getStoryByURL_BadURL checks that getStoryByURL reports an error
// for "https://google.com".
func TestWebshit_getStoryByURL_BadURL(t *testing.T) {
	const target = "https://google.com"

	ws := makeWS(t)
	_, lookupErr := ws.getStoryByURL(target)

	assert.Error(t, lookupErr)
}
// TestWebshit_GetBalance checks that GetBalance("foo") on the makeWS fixture
// returns 100.
func TestWebshit_GetBalance(t *testing.T) {
	ws := makeWS(t)

	got := ws.GetBalance("foo")

	assert.Equal(t, 100, got)
}
// TestWebshit_checkBids feeds checkBids two bids from the same user, only one
// of which has an entry in the story map, and expects a single result holding
// one winning and one losing article.
func TestWebshit_checkBids(t *testing.T) {
	ws := makeWS(t)

	stories := map[string]Story{
		"1": {Title: "bar", URL: "http://baz/?id=1"},
	}
	placed := []Bid{
		{User: "foo", Title: "bar", URL: "https://baz/?id=1", Bid: 10},
		{User: "foo", Title: "bar2", URL: "http://baz/?id=2", Bid: 10},
	}

	results := ws.checkBids(placed, stories)

	assert.Len(t, results, 1)
	if len(results) == 0 {
		// Nothing to inspect; the length assertion above already failed.
		return
	}
	assert.Len(t, results[0].WinningArticles, 1)
	assert.Len(t, results[0].LosingArticles, 1)
}
// TestWebshit_33PcWinner runs checkBids with three bids across two users: two
// bids (10 and 5) on the story present in the story map and one bid (10) on a
// story that is not. It expects two results, with the pot of 25 split 16 / 8
// between the winning bids.
func TestWebshit_33PcWinner(t *testing.T) {
	w := makeWS(t)
	bids := []Bid{
		{User: "foo", Title: "bar", URL: "https://baz/?id=1", Bid: 10},
		{User: "foo", Title: "bar2", URL: "http://baz/?id=2", Bid: 10},
		{User: "bar", Title: "bar", URL: "http://baz/?id=1", Bid: 5},
	}
	storyMap := map[string]Story{
		"1": {Title: "bar", URL: "http://baz/?id=1"},
	}
	result := w.checkBids(bids, storyMap)

	// Both entries are indexed below, so stop unless exactly two came back.
	// The previous guard (len > 0) would panic on result[1] with one entry.
	if !assert.Len(t, result, 2) {
		return
	}

	// NOTE(review): this assumes result[0] belongs to "foo" and result[1] to
	// "bar" — confirm that wrMapToSlice returns a deterministic order.
	assert.Len(t, result[0].WinningArticles, 1)
	assert.Len(t, result[0].LosingArticles, 1)
	assert.Len(t, result[1].WinningArticles, 1)
	assert.Len(t, result[1].LosingArticles, 0)
	// testify takes (expected, actual).
	assert.Equal(t, 16, result[0].Won)
	assert.Equal(t, 8, result[1].Won)
}

49
util/testhn/main.go Normal file
View File

@ -0,0 +1,49 @@
package main
import (
"flag"
"fmt"
"strconv"
"strings"
"github.com/gocolly/colly"
)
// url is the -url command-line flag: the page to scrape. It defaults to a
// fixed Hacker News item URL.
var url = flag.String("url", "https://news.ycombinator.com/item?id=21530860", "URL to scrape")
func main() {
flag.Parse()
//scrapeScoreAndComments(*url, func(score, comments int) {
// fmt.Printf("Finished scraping %s\nScore: %d, Comments: %d\n",
// *url, score, comments)
//})
score, comments := scrapeScoreAndComments(*url)
fmt.Printf("Finished scraping %s\nScore: %d, Comments: %d\n",
*url, score, comments)
}
// scrapeScoreAndComments fetches the page at url and returns the score and
// comment count found in its "td.subtext" row.
//
// A value that is missing from the page, or whose first whitespace-separated
// token is not an integer, is reported as 0. If the visit fails, the error is
// printed and whatever was collected so far (zeros by default) is returned.
func scrapeScoreAndComments(url string) (int, int) {
	// The collector is left in its default synchronous mode so that Visit only
	// returns after every callback has run. The earlier async + channel version
	// waited on OnScraped alone and therefore hung forever on a failed request.
	c := colly.NewCollector()

	// leadingInt parses the first whitespace-separated token of text
	// (e.g. "123 points" -> 123) and yields 0 when there is no token or the
	// token is not a number. Indexing Fields(...)[0] directly would panic on
	// empty text.
	leadingInt := func(text string) int {
		fields := strings.Fields(text)
		if len(fields) == 0 {
			return 0
		}
		// A non-numeric token deliberately counts as 0, as it did before.
		n, _ := strconv.Atoi(fields[0])
		return n
	}

	score := 0
	comments := 0

	c.OnHTML("td.subtext > span.score", func(r *colly.HTMLElement) {
		score = leadingInt(r.Text)
	})

	c.OnHTML("td.subtext > a[href*='item?id=']:last-of-type", func(r *colly.HTMLElement) {
		comments = leadingInt(r.Text)
	})

	if err := c.Visit(url); err != nil {
		// The signature has no error result, so report the failure here.
		fmt.Printf("could not scrape %s: %v\n", url, err)
	}

	return score, comments
}