
Commit 6279415

Merge pull request #10 from cyclone-github/dev
added -agent flag #8
2 parents 3636626 + 0c8e76e commit 6279415

File tree

- CHANGELOG.md
- README.md
- go.mod
- go.sum
- spider.go

5 files changed: +26 -38 lines changed


CHANGELOG.md

Lines changed: 4 additions & 0 deletions

@@ -1,3 +1,7 @@
+### v0.9.1
+```
+added flag "-agent" to allow user to specify custom user-agent; https://github.com/cyclone-github/spider/issues/8
+```
 ### v0.9.0
 ```
 added flag "-url-match" to only crawl URLs containing a specified keyword; https://github.com/cyclone-github/spider/issues/6

README.md

Lines changed: 7 additions & 3 deletions

@@ -8,7 +8,7 @@
 
 # Spider: URL Mode
 ```
-spider -url 'https://forum.hashpwn.net' -crawl 2 -delay 20 -sort -ngram 1-3 -timeout 1 -url-match wordlist -o forum.hashpwn.net_spider.txt
+spider -url 'https://forum.hashpwn.net' -crawl 2 -delay 20 -sort -ngram 1-3 -timeout 1 -url-match wordlist -o forum.hashpwn.net_spider.txt -agent 'foobar agent'
 ```
 ```
 ----------------------
@@ -68,12 +68,16 @@ Wordlist & ngram creation tool to crawl a given url or process a local file to c
 - `spider -url 'https://github.com/cyclone-github' -ngram 1-3 -sort`
 - To filter crawled URLs by keyword "foobar"
 - `spider -url 'https://github.com/cyclone-github' -url-match foobar`
+- To specify a custom user-agent
+- `spider -url 'https://github.com/cyclone-github' -agent 'foobar'`
 - To process a local text file, create ngrams len 1-3 and sort output by frequency
 - `spider -file foobar.txt -ngram 1-3 -sort`
 - Run `spider -help` to see a list of all options
 
 ### spider -help
 ```
+  -agent string
+    	Custom user-agent (default "Spider/0.9.1 (+https://github.com/cyclone-github/spider)")
   -crawl int
     	Depth of links to crawl (default 1)
   -cyclone
@@ -82,8 +86,6 @@ Wordlist & ngram creation tool to crawl a given url or process a local file to c
     	Delay in ms between each URL lookup to avoid rate limiting (default 10)
   -file string
     	Path to a local file to scrape
-  -url-match string
-    	Only crawl URLs containing this keyword (case-insensitive)
   -ngram string
     	Lengths of n-grams (e.g., "1-3" for 1, 2, and 3-length n-grams). (default "1")
   -o string
@@ -94,6 +96,8 @@ Wordlist & ngram creation tool to crawl a given url or process a local file to c
     	Timeout for URL crawling in seconds (default 1)
   -url string
     	URL of the website to scrape
+  -url-match string
+    	Only crawl URLs containing this keyword (case-insensitive)
   -version
     	Display version
 ```
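
The README change above documents the new `-agent` flag. One quick way to confirm what user-agent string spider actually sends is to crawl a throwaway local page that logs the incoming header. Below is a minimal sketch of such a helper, assuming a local server on port 8080; it is a test aid, not part of spider itself.

```go
// ua_echo.go: tiny local page that logs the User-Agent header it receives.
// Run it, then crawl it with a custom agent, e.g.:
//   spider -url 'http://localhost:8080' -crawl 1 -agent 'foobar agent'
package main

import (
	"fmt"
	"log"
	"net/http"
)

func main() {
	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		log.Printf("User-Agent: %q", r.UserAgent()) // what the crawler sent
		fmt.Fprintln(w, "<html><body><p>hello spider</p></body></html>")
	})
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```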

go.mod

Lines changed: 2 additions & 2 deletions

@@ -1,10 +1,10 @@
 module spider
 
-go 1.24.3
+go 1.24.5
 
 require github.com/PuerkitoBio/goquery v1.10.3
 
 require (
 	github.com/andybalholm/cascadia v1.3.3 // indirect
-	golang.org/x/net v0.41.0 // indirect
+	golang.org/x/net v0.42.0 // indirect
 )

go.sum

Lines changed: 2 additions & 2 deletions

@@ -24,8 +24,8 @@ golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
 golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
 golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
 golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
-golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw=
-golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA=
+golang.org/x/net v0.42.0 h1:jzkYrhi3YQWD6MLBJcsklgQsoAcw89EcZbJw8Z614hs=
+golang.org/x/net v0.42.0/go.mod h1:FF1RA5d3u7nAYA4z2TkclSCKh68eSXtiFwcWQpPXdt8=
 golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=

spider.go

Lines changed: 11 additions & 31 deletions

@@ -61,43 +61,22 @@ v0.9.0;
 fixed crawl-depth calculation logic
 fixed restrict link collection to .html, .htm, .txt and extension-less paths
 upgraded dependencies and bumped Go version to v1.24.3
+v0.9.1;
+added flag "-agent" to allow user to specify custom user-agent; https://github.com/cyclone-github/spider/issues/8
 
 TODO:
 -plaintext (allow user to "copy / paste" webpage)
 -text-match (only process webpages whose text contains specified keyword — similar to -url-match, but matches webpage text instead)
 */
 
-// clear screen function
-/*
-func clearScreen() {
-	var cmd *exec.Cmd
-
-	switch runtime.GOOS {
-	case "linux", "darwin":
-		cmd = exec.Command("clear")
-	case "windows":
-		cmd = exec.Command("cmd", "/c", "cls")
-	default:
-		fmt.Fprintln(os.Stderr, "Unsupported platform")
-		os.Exit(1)
-	}
-
-	cmd.Stdout = os.Stdout
-	if err := cmd.Run(); err != nil {
-		fmt.Fprintf(os.Stderr, "Failed to clear screen: %v\n", err)
-		os.Exit(1)
-	}
-}
-*/
-
 // goquery
-func getDocumentFromURL(targetURL string, timeout time.Duration) (*goquery.Document, bool, error) {
+func getDocumentFromURL(targetURL string, timeout time.Duration, agent string) (*goquery.Document, bool, error) {
 	client := &http.Client{Timeout: timeout}
 	req, err := http.NewRequest("GET", targetURL, nil)
 	if err != nil {
 		return nil, false, err
 	}
-	req.Header.Set("User-Agent", "Spider/0.9.0 (+https://github.com/cyclone-github/spider)")
+	req.Header.Set("User-Agent", agent)
 
 	res, err := client.Do(req)
 	if err != nil {
@@ -153,13 +132,13 @@ func getTextFromDocument(doc *goquery.Document) string {
 	return doc.Text()
 }
 
-func crawlAndScrape(u string, depth int, delay int, timeout time.Duration, urlCountChan chan<- int, textsChan chan<- string, visited map[string]bool, urlMatchStr string) {
+func crawlAndScrape(u string, depth int, delay int, timeout time.Duration, agent string, urlCountChan chan<- int, textsChan chan<- string, visited map[string]bool, urlMatchStr string) {
 	if visited[u] {
 		return
 	}
 	visited[u] = true // mark before fetch to avoid retry on error
 
-	doc, isSuccess, err := getDocumentFromURL(u, timeout)
+	doc, isSuccess, err := getDocumentFromURL(u, timeout, agent)
 	if err != nil {
 		fmt.Fprintf(os.Stderr, "Error fetching URL %s: %v\n", u, err)
 		return
@@ -197,7 +176,7 @@ func crawlAndScrape(u string, depth int, delay int, timeout time.Duration, urlCo
 				continue
 			}
 
-			crawlAndScrape(link, depth-1, delay, timeout, urlCountChan, textsChan, visited, urlMatchStr)
+			crawlAndScrape(link, depth-1, delay, timeout, agent, urlCountChan, textsChan, visited, urlMatchStr)
 		}
 	}
 }
@@ -272,6 +251,8 @@ func main() {
 	timeoutFlag := flag.Int("timeout", 1, "Timeout for URL crawling in seconds")
 	sortFlag := flag.Bool("sort", false, "Sort output by frequency")
 	urlMatchFlag := flag.String("url-match", "", "Only crawl URLs containing this keyword (case-insensitive)")
+	agentFlag := flag.String("agent", "Spider/0.9.1 (+https://github.com/cyclone-github/spider)", "Custom user-agent")
+
 	flag.Parse()
 
 	if *cycloneFlag {
@@ -281,7 +262,7 @@
 		os.Exit(0)
 	}
 	if *versionFlag {
-		version := "Cyclone's URL Spider v0.9.0"
+		version := "Cyclone's URL Spider v0.9.1"
 		fmt.Fprintln(os.Stderr, version)
 		os.Exit(0)
 	}
@@ -427,7 +408,7 @@ func main() {
 	wg.Add(1)
 	go func() {
 		defer wg.Done()
-		crawlAndScrape(*urlFlag, *crawlFlag, *delayFlag, timeoutDur, urlCountChan, textsChan, visitedURLs, urlMatchStr)
+		crawlAndScrape(*urlFlag, *crawlFlag, *delayFlag, timeoutDur, *agentFlag, urlCountChan, textsChan, visitedURLs, urlMatchStr)
 		time.Sleep(100 * time.Millisecond)
 		close(textsChan)
 		close(doneChan)
@@ -443,7 +424,6 @@ func main() {
 
 	// if nothing matched, exit early
 	if len(texts) == 0 {
-		time.Sleep(100)
 		fmt.Fprintln(os.Stderr, "No URLs crawled, exiting...") // boo, something went wrong!
 		if *crawlFlag == 1 {
 			fmt.Fprintln(os.Stderr, "Try increasing -crawl depth, or remove -url-match")
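The heart of the spider.go change is threading the flag value down to the request: `-agent` is read in main and ends up in `req.Header.Set("User-Agent", agent)`. The sketch below isolates that pattern in a self-contained form and checks it against an in-memory test server; `fetchWithAgent` is a hypothetical stand-in for spider's `getDocumentFromURL`, not the project's actual code.

```go
// Minimal sketch of the -agent wiring: a fetch helper that sets the
// User-Agent header from a caller-supplied string, verified against an
// httptest server that echoes the header it received.
package main

import (
	"fmt"
	"io"
	"net/http"
	"net/http/httptest"
	"time"
)

// fetchWithAgent is a hypothetical stand-in for spider's getDocumentFromURL.
func fetchWithAgent(url, agent string, timeout time.Duration) (string, error) {
	client := &http.Client{Timeout: timeout}
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return "", err
	}
	req.Header.Set("User-Agent", agent) // the same call the commit adds
	res, err := client.Do(req)
	if err != nil {
		return "", err
	}
	defer res.Body.Close()
	body, err := io.ReadAll(res.Body)
	return string(body), err
}

func main() {
	// in-memory server that replies with whatever User-Agent it was sent
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprint(w, r.UserAgent())
	}))
	defer srv.Close()

	got, err := fetchWithAgent(srv.URL, "Spider/0.9.1 (+https://github.com/cyclone-github/spider)", time.Second)
	if err != nil {
		panic(err)
	}
	fmt.Println("server saw User-Agent:", got)
}
```

Putting the default string into `flag.String("agent", ...)`, as the commit does, keeps the prior Spider/0.9.x-style user-agent when `-agent` is not supplied, so existing behavior is unchanged unless the user overrides it.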