// Parse the response body as HTML and collect the href of every <a> element.
doc, err := html.Parse(resp.Body)
defer resp.Body.Close()
if err != nil {
	return nil, fmt.Errorf("parsing %s as HTML: %v", url, err)
}

var links []string
visitNode := func(n *html.Node) {
	if n.Type == html.ElementNode && n.Data == "a" {
		for _, a := range n.Attr {
			if a.Key != "href" {
				continue
			}
			link, err := resp.Request.URL.Parse(a.Val) // resolve relative to the request URL
			if err != nil {
				continue // ignore bad URLs
			}
			links = append(links, link.String())
		}
	}
}
forEachNode(doc, visitNode, nil)
// forEachNode(doc, startElement, endElement)
return links, nil
}
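// NOTE: the fragment above begins after the HTTP request has already been
// made, so resp and url must come from an earlier http.Get(url) call. The
// sketch below reconstructs that assumed surrounding function and its
// imports; the package name, the function name Extract, and the status
// check are assumptions, not part of the original fragment.
package links

import (
	"fmt"
	"net/http"

	"golang.org/x/net/html"
)

// Extract fetches url, parses the response as HTML, and returns the links
// found in the document; its body continues with the code shown above.
func Extract(url string) ([]string, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	if resp.StatusCode != http.StatusOK {
		resp.Body.Close()
		return nil, fmt.Errorf("getting %s: %s", url, resp.Status)
	}
	doc, err := html.Parse(resp.Body)
	defer resp.Body.Close()
	if err != nil {
		return nil, fmt.Errorf("parsing %s as HTML: %v", url, err)
	}
	// ...collect links from doc with visitNode and forEachNode as shown above...
	_ = doc
	return nil, nil
}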
// forEachNode calls the functions pre(x) and post(x) for each node x in the
// tree rooted at n. Both functions are optional: pre is called before a
// node's children are visited (preorder), post after (postorder).
func forEachNode(n *html.Node, pre, post func(n *html.Node)) {
	if pre != nil {
		pre(n)
	}
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		forEachNode(c, pre, post)
	}
	if post != nil {
		post(n)
	}
}
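// The commented-out call forEachNode(doc, startElement, endElement) above
// refers to a pre/post pair that is not defined in this fragment. One
// plausible pair (an assumption, using the same fmt and golang.org/x/net/html
// imports as the sketch above) prints an indented outline of the element tree:
var depth int

func startElement(n *html.Node) {
	if n.Type == html.ElementNode {
		fmt.Printf("%*s<%s>\n", depth*2, "", n.Data)
		depth++
	}
}

func endElement(n *html.Node) {
	if n.Type == html.ElementNode {
		depth--
		fmt.Printf("%*s</%s>\n", depth*2, "", n.Data)
	}
}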
// Crawl the web concurrently.
visited := make(map[string]bool)
for ; n > 0; n-- {
	list := <-worklist
	for _, link := range list {
		if !visited[link] {
			visited[link] = true
			n++
			go func(link string) {
				worklist <- crawl(link)
			}(link)
		}
	}
}
}
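// The loop above refers to n, worklist, and crawl, none of which appear in
// the fragment. A minimal sketch of the assumed setup follows: worklist
// carries batches of discovered URLs, n counts pending sends to worklist,
// and crawl wraps the link extractor. The package layout and the
// gopl.io/ch5/links import path are assumptions.
package main

import (
	"fmt"
	"log"
	"os"

	"gopl.io/ch5/links"
)

func crawl(url string) []string {
	fmt.Println(url)
	list, err := links.Extract(url)
	if err != nil {
		log.Print(err)
	}
	return list
}

func main() {
	worklist := make(chan []string) // batches of URLs, may have duplicates
	var n int                       // number of pending sends to worklist

	// Seed the worklist with the command-line arguments.
	n++
	go func() { worklist <- os.Args[1:] }()

	// ...the "Crawl the web concurrently" loop shown above goes here...
}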
func main() {
	worklist := make(chan []string)  // lists of URLs, may have duplicates
	unseenLinks := make(chan string) // de-duplicated URLs

	// Add command-line arguments to worklist.
	go func() { worklist <- os.Args[1:] }()

	// Create 20 crawler goroutines to fetch each unseen link.
	for i := 0; i < 20; i++ {
		go func() {
			for link := range unseenLinks {
				foundLinks := crawl(link)
				go func() { worklist <- foundLinks }()
			}
		}()
	}

	// The main goroutine de-duplicates worklist items
	// and sends the unseen ones to the crawlers.
	seen := make(map[string]bool)
	for {
		select {
		case list := <-worklist:
			for _, link := range list {
				if !seen[link] {
					seen[link] = true
					unseenLinks <- link
				}
			}
		case <-time.After(3 * time.Second):
			fmt.Println("Exit, timeout")
			return
		}
	}
}
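// Two details of this version are worth noting. First, foundLinks are sent to
// worklist from a fresh goroutine (go func() { worklist <- foundLinks }()) so
// a crawler worker never blocks on that send; if all 20 workers were stuck
// sending to worklist while the main goroutine was itself stuck sending to
// unseenLinks, the program would deadlock. Second, the time.After case is
// what lets this version terminate: each pass through the loop waits at most
// 3 seconds for a new batch on worklist, and when nothing arrives in that
// window main prints a message and returns, taking the crawler goroutines
// down with it.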