@@ -61,43 +61,22 @@ v0.9.0;
6161 fixed crawl-depth calculation logic
6262 fixed restrict link collection to .html, .htm, .txt and extension-less paths
6363 upgraded dependencies and bumped Go version to v1.24.3
64+ v0.9.1;
65+ added flag "-agent" to allow user to specify custom user-agent; https://github.com/cyclone-github/spider/issues/8
6466
6567TODO:
6668 -plaintext (allow user to "copy / paste" webpage)
6769 -text-match (only process webpages whose text contains specified keyword — similar to -url-match, but matches webpage text instead)
6870*/
6971
70- // clear screen function
71- /*
72- func clearScreen() {
73- 	var cmd *exec.Cmd
74-
75- 	switch runtime.GOOS {
76- 	case "linux", "darwin":
77- 		cmd = exec.Command("clear")
78- 	case "windows":
79- 		cmd = exec.Command("cmd", "/c", "cls")
80- 	default:
81- 		fmt.Fprintln(os.Stderr, "Unsupported platform")
82- 		os.Exit(1)
83- 	}
84-
85- 	cmd.Stdout = os.Stdout
86- 	if err := cmd.Run(); err != nil {
87- 		fmt.Fprintf(os.Stderr, "Failed to clear screen: %v\n", err)
88- 		os.Exit(1)
89- 	}
90- }
91- */
92-
9372// goquery
94- func getDocumentFromURL(targetURL string, timeout time.Duration) (*goquery.Document, bool, error) {
73+ func getDocumentFromURL(targetURL string, timeout time.Duration, agent string) (*goquery.Document, bool, error) {
9574 	client := &http.Client{Timeout: timeout}
9675 	req, err := http.NewRequest("GET", targetURL, nil)
9776 	if err != nil {
9877 		return nil, false, err
9978 	}
100- 	req.Header.Set("User-Agent", "Spider/0.9.0 (+https://github.com/cyclone-github/spider)")
79+ 	req.Header.Set("User-Agent", agent)
10180
10281 	res, err := client.Do(req)
10382 	if err != nil {
@@ -153,13 +132,13 @@ func getTextFromDocument(doc *goquery.Document) string {
153132 	return doc.Text()
154133 }
155134
156- func crawlAndScrape(u string, depth int, delay int, timeout time.Duration, urlCountChan chan<- int, textsChan chan<- string, visited map[string]bool, urlMatchStr string) {
135+ func crawlAndScrape(u string, depth int, delay int, timeout time.Duration, agent string, urlCountChan chan<- int, textsChan chan<- string, visited map[string]bool, urlMatchStr string) {
157136 	if visited[u] {
158137 		return
159138 	}
160139 	visited[u] = true // mark before fetch to avoid retry on error
161140
162- 	doc, isSuccess, err := getDocumentFromURL(u, timeout)
141+ 	doc, isSuccess, err := getDocumentFromURL(u, timeout, agent)
163142 	if err != nil {
164143 		fmt.Fprintf(os.Stderr, "Error fetching URL %s: %v\n", u, err)
165144 		return
@@ -197,7 +176,7 @@ func crawlAndScrape(u string, depth int, delay int, timeout time.Duration, urlCo
197176 			continue
198177 		}
199178
200- 		crawlAndScrape(link, depth-1, delay, timeout, urlCountChan, textsChan, visited, urlMatchStr)
179+ 		crawlAndScrape(link, depth-1, delay, timeout, agent, urlCountChan, textsChan, visited, urlMatchStr)
201180 	}
202181 	}
203182 }
@@ -272,6 +251,8 @@ func main() {
272251 	timeoutFlag := flag.Int("timeout", 1, "Timeout for URL crawling in seconds")
273252 	sortFlag := flag.Bool("sort", false, "Sort output by frequency")
274253 	urlMatchFlag := flag.String("url-match", "", "Only crawl URLs containing this keyword (case-insensitive)")
254+ 	agentFlag := flag.String("agent", "Spider/0.9.1 (+https://github.com/cyclone-github/spider)", "Custom user-agent")
255+
275256 	flag.Parse()
276257
277258 	if *cycloneFlag {
@@ -281,7 +262,7 @@ func main() {
281262 		os.Exit(0)
282263 	}
283264 	if *versionFlag {
284- 		version := "Cyclone's URL Spider v0.9.0"
265+ 		version := "Cyclone's URL Spider v0.9.1"
285266 		fmt.Fprintln(os.Stderr, version)
286267 		os.Exit(0)
287268 	}
@@ -427,7 +408,7 @@ func main() {
427408 	wg.Add(1)
428409 	go func() {
429410 		defer wg.Done()
430- 		crawlAndScrape(*urlFlag, *crawlFlag, *delayFlag, timeoutDur, urlCountChan, textsChan, visitedURLs, urlMatchStr)
411+ 		crawlAndScrape(*urlFlag, *crawlFlag, *delayFlag, timeoutDur, *agentFlag, urlCountChan, textsChan, visitedURLs, urlMatchStr)
431412 		time.Sleep(100 * time.Millisecond)
432413 		close(textsChan)
433414 		close(doneChan)
@@ -443,7 +424,6 @@ func main() {
443424
444425 	// if nothing matched, exit early
445426 	if len(texts) == 0 {
446- 		time.Sleep(100)
447427 		fmt.Fprintln(os.Stderr, "No URLs crawled, exiting...") // boo, something went wrong!
448428 		if *crawlFlag == 1 {
449429 			fmt.Fprintln(os.Stderr, "Try increasing -crawl depth, or remove -url-match")
0 commit comments