1- #!/usr/bin/python2.5
1+ #!/usr/bin/env python
22
33#Copyright(c)2009 Internet Archive. Software license GPL version 3.
44
3030 'url_base' : '/catalog' ,
3131 'urnroot' : 'urn:x-internet-archive:bookserver:catalog' ,
3232 'solr_base' : 'http://se.us.archive.org:8983/solr/select?fl=identifier,title,creator,publicdate,date,contributor,publisher,subject,language,format,month&wt=json' ,
33- 'query_base' : 'format%3Aabbyy+AND+format%3Ascandata+AND+format%3Apdf+AND+NOT+ocr%3A%22language+not%22+AND+NOT+collection%3Alendinglibrary+AND+NOT+collection%3Aopensource+AND+NOT+collection%3Aprintdisabled'
33+ 'query_base' : 'format%3Aabbyy+AND+format%3Ascandata+AND+format%3Apdf+AND+NOT+ocr%3A%22language+not%22+AND+NOT+collection%3Alendinglibrary+AND+NOT+collection%3Aopensource+AND+NOT+collection%3Aprintdisabled+AND+NOT+collection%3Arosettaproject '
3434}
3535
3636urls = (
4444 '/search(.*)' , 'htmlsearch' ,
4545 '/crawlable(?:/(.*))?(|.html)' , 'crawlable' ,
4646 '/(|index.html)' , 'index' ,
47- '/(.*)' , 'indexRedirect' ,
47+ '/(.*)' , 'indexRedirect' ,
4848 )
4949
5050application = web .application (urls , globals ()).wsgifunc ()
@@ -57,17 +57,17 @@ def getDateString():
5757 #IA is continuously scanning books. Since this OPDS file is constructed
5858 #from search engine results, let's change the updated date every midnight
5959 t = time .gmtime ()
60- datestr = time .strftime ('%Y-%m-%dT%H:%M:%SZ' ,
60+ datestr = time .strftime ('%Y-%m-%dT%H:%M:%SZ' ,
6161 (t .tm_year , t .tm_mon , t .tm_mday , 0 , 0 , 0 , 0 , 0 , 0 ))
6262 return datestr
63-
63+
def getEnv(key, default=None):
    """Look up `key` in the current request's environment.

    Reads web.ctx['environ'] and returns the value stored under `key`,
    or `default` (None unless supplied) when the key is not present.
    """
    # dict.get() performs the lookup in a single step and replaces the
    # deprecated has_key(), which no longer exists on Python 3 dicts.
    return web.ctx['environ'].get(key, default)
70-
70+
7171def getDevice ():
7272 userAgent = getEnv ('HTTP_USER_AGENT' )
7373 if userAgent is not None :
@@ -85,7 +85,7 @@ def GET(self, url):
8585 mode = 'html'
8686
8787 datestr = getDateString ()
88-
88+
8989 c = catalog .Catalog (
9090 title = 'Internet Archive Catalog' ,
9191 urn = pubInfo ['urnroot' ],
@@ -108,46 +108,46 @@ def GET(self, url):
108108 'new' : 'new'
109109 }
110110 type = 'application/atom+xml'
111-
111+
112112 l = catalog .Link (url = links ['alpha' ], type = type )
113113 e = catalog .Entry ({'title' : 'Alphabetical By Title' ,
114114 'urn' : pubInfo ['urnroot' ] + ':titles:all' ,
115115 'updated' : datestr ,
116116 'content' : 'Alphabetical list of all titles.'
117117 }, links = (l ,))
118118 c .addEntry (e )
119-
119+
120120 l = catalog .Link (url = links ['downloads' ], type = type )
121121 e = catalog .Entry ({'title' : 'Most Downloaded Books' ,
122122 'urn' : pubInfo ['urnroot' ] + ':downloads' ,
123123 'updated' : datestr ,
124124 'content' : 'The most downloaded books from the Internet Archive in the last month.'
125125 }, links = (l ,))
126-
126+
127127 c .addEntry (e )
128128
129- l = catalog .Link (url = links ['new' ], type = type )
129+ l = catalog .Link (url = links ['new' ], type = type )
130130 e = catalog .Entry ({'title' : 'Recent Scans' ,
131131 'urn' : pubInfo ['urnroot' ] + ':new' ,
132132 'updated' : datestr ,
133133 'content' : 'Books most recently scanned by the Internet Archive.'
134134 }, links = (l ,))
135-
135+
136136 c .addEntry (e )
137-
137+
138138 osDescriptionDoc = 'http://bookserver.archive.org/catalog/opensearch.xml'
139139 o = catalog .OpenSearch (osDescriptionDoc )
140140 c .addOpenSearch (o )
141-
141+
142142 if url and url .endswith ('.html' ):
143143 r = output .ArchiveCatalogToHtml (c , device = getDevice ())
144144 web .header ('Content-Type' , 'text/html' )
145145 return r .toString ()
146- else :
146+ else :
147147 r = output .CatalogToAtom (c )
148148 web .header ('Content-Type' , pubInfo ['mimetype' ])
149149 return r .toString ()
150-
150+
151151
152152# /alpha/a/0
153153#______________________________________________________________________________
@@ -162,7 +162,7 @@ def GET(self, letter, start):
162162 start = start [:- 5 ]
163163 mode = 'html'
164164 start = int (start )
165-
165+
166166 solrUrl = pubInfo ['solr_base' ]+ '&q=' + pubInfo ['query_base' ]+ '+AND+firstTitle%3A' + letter .upper ()+ '&sort=titleSorter+asc&rows=' + str (numRows )+ '&start=' + str (start * numRows )
167167 titleFragment = 'books starting with "%s"' % (letter .upper ())
168168 urn = pubInfo ['urnroot' ] + ':%s:%d' % (letter , start )
@@ -172,7 +172,7 @@ def GET(self, letter, start):
172172 urlBase = '/catalog/alpha/%s/' % (letter ),
173173 titleFragment = titleFragment )
174174 c = ingestor .getCatalog ()
175-
175+
176176 if 'html' == mode :
177177 web .header ('Content-Type' , 'text/html' )
178178 r = output .ArchiveCatalogToHtml (c , device = getDevice ())
@@ -181,7 +181,7 @@ def GET(self, letter, start):
181181 web .header ('Content-Type' , pubInfo ['mimetype' ])
182182 r = output .CatalogToAtom (c , fabricateContentElement = True )
183183 return r .toString ()
184-
184+
185185# /alpha.xml
186186#______________________________________________________________________________
187187class alphaList :
@@ -197,7 +197,7 @@ def GET(self, extension):
197197 #TODO: create a version of /alpha.xml with the correct updated dates,
198198 #and cache it for an hour to ease load on solr
199199 datestr = getDateString ()
200-
200+
201201 c = catalog .Catalog (
202202 title = 'Internet Archive - All Titles' ,
203203 urn = pubInfo ['urnroot' ] + ':titles:all' ,
@@ -217,8 +217,8 @@ def GET(self, extension):
217217 linkType = 'application/atom+xml'
218218 else :
219219 raise ValueError ('Unsupported extension %s' % extension )
220-
221- l = catalog .Link (url = self .alphaURL (extension , lower , 0 ), type = linkType )
220+
221+ l = catalog .Link (url = self .alphaURL (extension , lower , 0 ), type = linkType )
222222 e = catalog .Entry ({'title' : 'Titles: ' + letter ,
223223 'urn' : pubInfo ['urnroot' ] + ':titles:' + lower ,
224224 'updated' : datestr ,
@@ -229,7 +229,7 @@ def GET(self, extension):
229229 osDescriptionDoc = 'http://bookserver.archive.org/catalog/opensearch.xml'
230230 o = catalog .OpenSearch (osDescriptionDoc )
231231 c .addOpenSearch (o )
232-
232+
233233 if ('xml' == extension ):
234234 web .header ('Content-Type' , pubInfo ['mimetype' ])
235235 r = output .CatalogToAtom (c )
@@ -249,7 +249,7 @@ def GET(self, extension):
249249 urn = pubInfo ['urnroot' ] + ':downloads'
250250 ingestor = catalog .ingest .IASolrToCatalog (pubInfo , solrUrl , urn , titleFragment = titleFragment )
251251 c = ingestor .getCatalog ()
252-
252+
253253 if ('xml' == extension ):
254254 web .header ('Content-Type' , pubInfo ['mimetype' ])
255255 r = output .CatalogToAtom (c , fabricateContentElement = True )
@@ -269,16 +269,16 @@ def GET(self, start, extension):
269269 extension = 'html'
270270 else :
271271 extension = 'xml'
272-
272+
273273 if not start :
274274 start = 0
275275 else :
276276 if start .endswith ('.html' ):
277277 extension = 'html'
278278 start = start [:- 5 ]
279279 start = int (start )
280-
281-
280+
281+
282282 solrUrl = pubInfo ['solr_base' ] + '&q=' + pubInfo ['query_base' ]+ '&sort=publicdate+desc&rows=' + str (numRows )+ '&start=' + str (start * numRows )
283283 titleFragment = 'books sorted by update date'
284284 urn = pubInfo ['urnroot' ] + ':new:%d' % (start )
@@ -287,7 +287,7 @@ def GET(self, start, extension):
287287 urlBase = '/catalog/new/' ,
288288 titleFragment = titleFragment )
289289 c = ingestor .getCatalog ()
290-
290+
291291 if 'html' == extension :
292292 web .header ('Content-Type' , 'text/html' )
293293 r = output .ArchiveCatalogToHtml (c , device = getDevice ())
@@ -305,15 +305,15 @@ def GET(self, start, extension):
305305 extension = 'html'
306306 else :
307307 extension = 'xml'
308-
308+
309309 if not start :
310310 start = 0
311311 else :
312312 if start .endswith ('.html' ):
313313 extension = 'html'
314314 start = start [:- 5 ]
315315 start = int (start )
316-
316+
317317 crawlNumRows = 1000 ;
318318 solrUrl = pubInfo ['solr_base' ] + '&q=' + pubInfo ['query_base' ]+ '&rows=' + str (crawlNumRows )+ '&start=' + str (start * crawlNumRows )
319319 titleFragment = '- crawlable feed'
@@ -323,7 +323,7 @@ def GET(self, start, extension):
323323 urlBase = '/catalog/crawlable/' ,
324324 titleFragment = titleFragment )
325325 c = ingestor .getCatalog ()
326-
326+
327327 if 'html' == extension :
328328 web .header ('Content-Type' , 'text/html' )
329329 r = output .ArchiveCatalogToHtml (c , device = getDevice ())
@@ -335,7 +335,7 @@ def GET(self, start, extension):
335335
336336
337337# /opensearch
338- #______________________________________________________________________________
338+ #______________________________________________________________________________
339339class opensearch :
340340 def GET (self , query ):
341341 params = cgi .parse_qs (web .ctx .query )
@@ -346,7 +346,7 @@ def GET(self, query):
346346 start = int (params ['start' ][0 ])
347347
348348 q = params ['?q' ][0 ]
349- qq = urllib .quote (q )
349+ qq = urllib .quote (q )
350350 solrUrl = pubInfo ['solr_base' ] + '&q=' + qq + '+AND+' + pubInfo ['query_base' ]+ '&sort=month+desc&rows=' + str (numRows )+ '&start=' + str (start * numRows )
351351 titleFragment = 'search results for ' + q
352352 urn = pubInfo ['urnroot' ] + ':search:%s:%d' % (qq , start )
@@ -361,15 +361,15 @@ def GET(self, query):
361361 web .header ('Content-Type' , pubInfo ['mimetype' ])
362362 r = output .CatalogToAtom (c , fabricateContentElement = True )
363363 return r .toString ()
364-
364+
365365# /search
366- #______________________________________________________________________________
366+ #______________________________________________________________________________
367367class htmlsearch :
368368 def GET (self , query ):
369369 qs = web .ctx .query
370370 if qs .startswith ('?' ):
371371 qs = qs [1 :]
372-
372+
373373 params = cgi .parse_qs (qs )
374374
375375 if not 'start' in params :
@@ -382,7 +382,7 @@ def GET(self, query):
382382
383383 q = params ['q' ][0 ]
384384 qq = urllib .quote (q )
385- solrUrl = 'http://se.us.archive.org:8983/solr/select?q=' + qq + '+AND+' + pubInfo ['query_base' ]+ '&fl=identifier,title,creator,oai_updatedate,date,contributor,publisher,subject,language,format&rows=' + str (numRows )+ '&start=' + str (start * numRows )+ '&wt=json'
385+ solrUrl = 'http://se.us.archive.org:8983/solr/select?q=' + qq + '+AND+' + pubInfo ['query_base' ]+ '&fl=identifier,title,creator,oai_updatedate,date,contributor,publisher,subject,language,format&rows=' + str (numRows )+ '&start=' + str (start * numRows )+ '&wt=json'
386386 titleFragment = 'search results for ' + q
387387 urn = pubInfo ['urnroot' ] + ':search:%s:%d' % (qq , start )
388388
@@ -392,40 +392,40 @@ def GET(self, query):
392392 titleFragment = titleFragment )
393393
394394 c = ingestor .getCatalog ()
395-
395+
396396 web .header ('Content-Type' , 'text/html' )
397397 r = output .ArchiveCatalogToHtml (c , device = getDevice ())
398398 return r .toString ()
399399
400400# /opensearch.xml - Open Search Description
401- #______________________________________________________________________________
401+ #______________________________________________________________________________
class openSearchDescription:
    """Handler that serves the OpenSearch description document."""

    def GET(self):
        """Return the OpenSearch description XML as a string.

        The search URL template is rooted at pubInfo['opdsroot'] and points
        at the /opensearch endpoint with `q` and `start` parameters.
        """
        # NOTE(review): the OpenSearch 1.1 spec suggests
        # application/opensearchdescription+xml for this document; the
        # existing header value is kept so current clients are unaffected.
        web.header('Content-Type', 'application/atom+xml')
        # A literal '&' inside an XML attribute value must be written as
        # '&amp;', otherwise the document is not well-formed XML.
        return """<?xml version="1.0" encoding="UTF-8"?>
<OpenSearchDescription xmlns="http://a9.com/-/spec/opensearch/1.1/">
    <ShortName>Internet Archive Search</ShortName>
    <Description>Search archive.org's OPDS Catalog.</Description>
    <Url type="application/atom+xml"
        template="%s/opensearch?q={searchTerms}&amp;start={startPage?}"/>
</OpenSearchDescription>""" % (pubInfo['opdsroot'])
412412
413413
414414# redirect to remove trailing slash
415- #______________________________________________________________________________
415+ #______________________________________________________________________________
class redirect:
    """Handler that redirects the client to '/' + path.

    Presumably the matching route captures the path without its trailing
    slash -- verify against the `urls` table.
    """

    def GET(self, path):
        """Send a 303 See Other pointing at '/' + `path`."""
        # web.seeother is an HTTP error/redirect object in the web.py
        # application API; it has to be raised so the framework ends the
        # request with the redirect instead of using GET's None return
        # value as the response body.
        raise web.seeother('/' + path)
419419
420420# redirect to index
421- #______________________________________________________________________________
421+ #______________________________________________________________________________
class indexRedirect:
    """Catch-all handler that sends any unmatched path back to the index."""

    def GET(self, path):
        """Send a 303 See Other pointing at '/'; `path` is ignored."""
        # web.seeother is an HTTP error/redirect object in the web.py
        # application API; it has to be raised so the framework ends the
        # request with the redirect instead of using GET's None return
        # value as the response body.
        raise web.seeother('/')
425425
426-
426+
427427# main() - standalone mode
428- #______________________________________________________________________________
428+ #______________________________________________________________________________
429429if __name__ == "__main__" :
430430 #run in standalone mode
431431 app = web .application (urls , globals ())
0 commit comments