Exclude the rosettaproject collection from the feed; strip trailing whitespace

rajbot · rajbot · commit 9b778be41933 · 2011-12-05T19:25:07.000Z
diff --git a/opds.py b/opds.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python2.5
+#!/usr/bin/env python
 
 #Copyright(c)2009 Internet Archive. Software license GPL version 3.
 
@@ -30,7 +30,7 @@
     'url_base'   : '/catalog',
     'urnroot'    : 'urn:x-internet-archive:bookserver:catalog',
     'solr_base'  : 'http://se.us.archive.org:8983/solr/select?fl=identifier,title,creator,publicdate,date,contributor,publisher,subject,language,format,month&wt=json',
-    'query_base' : 'format%3Aabbyy+AND+format%3Ascandata+AND+format%3Apdf+AND+NOT+ocr%3A%22language+not%22+AND+NOT+collection%3Alendinglibrary+AND+NOT+collection%3Aopensource+AND+NOT+collection%3Aprintdisabled'
+    'query_base' : 'format%3Aabbyy+AND+format%3Ascandata+AND+format%3Apdf+AND+NOT+ocr%3A%22language+not%22+AND+NOT+collection%3Alendinglibrary+AND+NOT+collection%3Aopensource+AND+NOT+collection%3Aprintdisabled+AND+NOT+collection%3Arosettaproject'
 }
 
 urls = (
@@ -44,7 +44,7 @@
     '/search(.*)',                  'htmlsearch',
     '/crawlable(?:/(.*))?(|.html)', 'crawlable',
     '/(|index.html)',               'index',
-    '/(.*)',                        'indexRedirect',        
+    '/(.*)',                        'indexRedirect',
     )
 
 application = web.application(urls, globals()).wsgifunc()
@@ -57,17 +57,17 @@ def getDateString():
     #IA is continuously scanning books. Since this OPDS file is constructed
     #from search engine results, let's change the updated date every midnight
     t       = time.gmtime()
-    datestr = time.strftime('%Y-%m-%dT%H:%M:%SZ', 
+    datestr = time.strftime('%Y-%m-%dT%H:%M:%SZ',
                 (t.tm_year, t.tm_mon, t.tm_mday, 0, 0, 0, 0, 0, 0))
     return datestr
-    
+
 def getEnv(key, default = None):
     env = web.ctx['environ']
     if env.has_key(key):
         return env[key]
     else:
         return default
-        
+
 def getDevice():
     userAgent = getEnv('HTTP_USER_AGENT')
     if userAgent is not None:
@@ -85,7 +85,7 @@ def GET(self, url):
             mode = 'html'
 
         datestr = getDateString()
-        
+
         c = catalog.Catalog(
                             title     = 'Internet Archive Catalog',
                             urn       = pubInfo['urnroot'],
@@ -108,46 +108,46 @@ def GET(self, url):
                      'new': 'new'
             }
             type = 'application/atom+xml'
-            
+
         l = catalog.Link(url = links['alpha'], type = type)
         e = catalog.Entry({'title'  : 'Alphabetical By Title',
                            'urn'     : pubInfo['urnroot'] + ':titles:all',
                            'updated' : datestr,
                            'content' : 'Alphabetical list of all titles.'
                          }, links=(l,))
         c.addEntry(e)
-        
+
         l = catalog.Link(url = links['downloads'], type = type)
         e = catalog.Entry({'title'   : 'Most Downloaded Books',
                            'urn'     : pubInfo['urnroot'] + ':downloads',
                            'updated' : datestr,
                            'content' : 'The most downloaded books from the Internet Archive in the last month.'
                          }, links=(l,))
-        
+
         c.addEntry(e)
 
-        l = catalog.Link(url = links['new'], type = type)        
+        l = catalog.Link(url = links['new'], type = type)
         e = catalog.Entry({'title'   : 'Recent Scans',
                            'urn'     : pubInfo['urnroot'] + ':new',
                            'updated' : datestr,
                            'content' : 'Books most recently scanned by the Internet Archive.'
                          }, links=(l,))
-        
+
         c.addEntry(e)
-        
+
         osDescriptionDoc = 'http://bookserver.archive.org/catalog/opensearch.xml'
         o = catalog.OpenSearch(osDescriptionDoc)
         c.addOpenSearch(o)
-        
+
         if url and url.endswith('.html'):
             r = output.ArchiveCatalogToHtml(c, device = getDevice())
             web.header('Content-Type', 'text/html')
             return r.toString()
-        else:        
+        else:
             r = output.CatalogToAtom(c)
             web.header('Content-Type', pubInfo['mimetype'])
             return r.toString()
-                
+
 
 # /alpha/a/0
 #______________________________________________________________________________
@@ -162,7 +162,7 @@ def GET(self, letter, start):
                 start = start[:-5]
                 mode = 'html'
             start = int(start)
-                               
+
         solrUrl       = pubInfo['solr_base']+'&q='+pubInfo['query_base']+'+AND+firstTitle%3A'+letter.upper()+'&sort=titleSorter+asc&rows='+str(numRows)+'&start='+str(start*numRows)
         titleFragment = 'books starting with "%s"' % (letter.upper())
         urn           = pubInfo['urnroot'] + ':%s:%d'%(letter, start)
@@ -172,7 +172,7 @@ def GET(self, letter, start):
                                                 urlBase='/catalog/alpha/%s/' % (letter),
                                                 titleFragment = titleFragment)
         c = ingestor.getCatalog()
-    
+
         if 'html' == mode:
             web.header('Content-Type', 'text/html')
             r = output.ArchiveCatalogToHtml(c, device = getDevice())
@@ -181,7 +181,7 @@ def GET(self, letter, start):
             web.header('Content-Type', pubInfo['mimetype'])
             r = output.CatalogToAtom(c, fabricateContentElement=True)
             return r.toString()
-        
+
 # /alpha.xml
 #______________________________________________________________________________
 class alphaList:
@@ -197,7 +197,7 @@ def GET(self, extension):
         #TODO: create a version of /alpha.xml with the correct updated dates,
         #and cache it for an hour to ease load on solr
         datestr = getDateString()
-        
+
         c = catalog.Catalog(
                             title     = 'Internet Archive - All Titles',
                             urn       = pubInfo['urnroot'] + ':titles:all',
@@ -217,8 +217,8 @@ def GET(self, extension):
                 linkType = 'application/atom+xml'
             else:
                 raise ValueError('Unsupported extension %s' % extension)
-                
-            l = catalog.Link(url = self.alphaURL(extension, lower, 0), type = linkType)        
+
+            l = catalog.Link(url = self.alphaURL(extension, lower, 0), type = linkType)
             e = catalog.Entry({'title'   : 'Titles: ' + letter,
                                'urn'     : pubInfo['urnroot'] + ':titles:'+lower,
                                'updated' : datestr,
@@ -229,7 +229,7 @@ def GET(self, extension):
         osDescriptionDoc = 'http://bookserver.archive.org/catalog/opensearch.xml'
         o = catalog.OpenSearch(osDescriptionDoc)
         c.addOpenSearch(o)
-        
+
         if ('xml' == extension):
             web.header('Content-Type', pubInfo['mimetype'])
             r = output.CatalogToAtom(c)
@@ -249,7 +249,7 @@ def GET(self, extension):
         urn           = pubInfo['urnroot'] + ':downloads'
         ingestor = catalog.ingest.IASolrToCatalog(pubInfo, solrUrl, urn, titleFragment=titleFragment)
         c = ingestor.getCatalog()
-        
+
         if ('xml' == extension):
             web.header('Content-Type', pubInfo['mimetype'])
             r = output.CatalogToAtom(c, fabricateContentElement=True)
@@ -269,16 +269,16 @@ def GET(self, start, extension):
             extension = 'html'
         else:
             extension = 'xml'
-        
+
         if not start:
             start = 0
         else:
             if start.endswith('.html'):
                 extension = 'html'
                 start = start[:-5]
             start = int(start)
-        
-                               
+
+
         solrUrl       = pubInfo['solr_base'] + '&q='+pubInfo['query_base']+'&sort=publicdate+desc&rows='+str(numRows)+'&start='+str(start*numRows)
         titleFragment = 'books sorted by update date'
         urn           = pubInfo['urnroot'] + ':new:%d' % (start)
@@ -287,7 +287,7 @@ def GET(self, start, extension):
                                                 urlBase='/catalog/new/',
                                                 titleFragment = titleFragment)
         c = ingestor.getCatalog()
-    
+
         if 'html' == extension:
             web.header('Content-Type', 'text/html')
             r = output.ArchiveCatalogToHtml(c, device = getDevice())
@@ -305,15 +305,15 @@ def GET(self, start, extension):
             extension = 'html'
         else:
             extension = 'xml'
-        
+
         if not start:
             start = 0
         else:
             if start.endswith('.html'):
                 extension = 'html'
                 start = start[:-5]
             start = int(start)
-        
+
         crawlNumRows = 1000;
         solrUrl       = pubInfo['solr_base'] + '&q='+pubInfo['query_base']+'&rows='+str(crawlNumRows)+'&start='+str(start*crawlNumRows)
         titleFragment = '- crawlable feed'
@@ -323,7 +323,7 @@ def GET(self, start, extension):
                                                 urlBase='/catalog/crawlable/',
                                                 titleFragment = titleFragment)
         c = ingestor.getCatalog()
-    
+
         if 'html' == extension:
             web.header('Content-Type', 'text/html')
             r = output.ArchiveCatalogToHtml(c, device = getDevice())
@@ -335,7 +335,7 @@ def GET(self, start, extension):
 
 
 # /opensearch
-#______________________________________________________________________________        
+#______________________________________________________________________________
 class opensearch:
     def GET(self, query):
         params = cgi.parse_qs(web.ctx.query)
@@ -346,7 +346,7 @@ def GET(self, query):
             start = int(params['start'][0])
 
         q  = params['?q'][0]
-        qq = urllib.quote(q)     
+        qq = urllib.quote(q)
         solrUrl       = pubInfo['solr_base'] + '&q='+qq+'+AND+'+pubInfo['query_base']+'&sort=month+desc&rows='+str(numRows)+'&start='+str(start*numRows)
         titleFragment = 'search results for ' + q
         urn           = pubInfo['urnroot'] + ':search:%s:%d' % (qq, start)
@@ -361,15 +361,15 @@ def GET(self, query):
         web.header('Content-Type', pubInfo['mimetype'])
         r = output.CatalogToAtom(c, fabricateContentElement=True)
         return r.toString()
-        
+
 # /search
-#______________________________________________________________________________        
+#______________________________________________________________________________
 class htmlsearch:
     def GET(self, query):
         qs = web.ctx.query
         if qs.startswith('?'):
             qs = qs[1:]
-        
+
         params = cgi.parse_qs(qs)
 
         if not 'start' in params:
@@ -382,7 +382,7 @@ def GET(self, query):
 
         q  = params['q'][0]
         qq = urllib.quote(q)
-        solrUrl       = 'http://se.us.archive.org:8983/solr/select?q='+qq+'+AND+'+pubInfo['query_base']+'&fl=identifier,title,creator,oai_updatedate,date,contributor,publisher,subject,language,format&rows='+str(numRows)+'&start='+str(start*numRows)+'&wt=json'        
+        solrUrl       = 'http://se.us.archive.org:8983/solr/select?q='+qq+'+AND+'+pubInfo['query_base']+'&fl=identifier,title,creator,oai_updatedate,date,contributor,publisher,subject,language,format&rows='+str(numRows)+'&start='+str(start*numRows)+'&wt=json'
         titleFragment = 'search results for ' + q
         urn           = pubInfo['urnroot'] + ':search:%s:%d' % (qq, start)
 
@@ -392,40 +392,40 @@ def GET(self, query):
                                                 titleFragment = titleFragment)
 
         c = ingestor.getCatalog()
-        
+
         web.header('Content-Type', 'text/html')
         r = output.ArchiveCatalogToHtml(c, device = getDevice())
         return r.toString()
 
 # /opensearch.xml - Open Search Description
-#______________________________________________________________________________        
+#______________________________________________________________________________
 class openSearchDescription:
     def GET(self):
         web.header('Content-Type', 'application/atom+xml')
         return """<?xml version="1.0" encoding="UTF-8"?>
 <OpenSearchDescription xmlns="http://a9.com/-/spec/opensearch/1.1/">
     <ShortName>Internet Archive Search</ShortName>
     <Description>Search archive.org's OPDS Catalog.</Description>
-    <Url type="application/atom+xml" 
+    <Url type="application/atom+xml"
         template="%s/opensearch?q={searchTerms}&amp;start={startPage?}"/>
 </OpenSearchDescription>""" % (pubInfo['opdsroot'])
 
 
 # redirect to remove trailing slash
-#______________________________________________________________________________        
+#______________________________________________________________________________
 class redirect:
     def GET(self, path):
         web.seeother('/' + path)
 
 # redirect to index
-#______________________________________________________________________________        
+#______________________________________________________________________________
 class indexRedirect:
     def GET(self, path):
         web.seeother('/')
 
-        
+
 # main() - standalone mode
-#______________________________________________________________________________        
+#______________________________________________________________________________
 if __name__ == "__main__":
     #run in standalone mode
     app = web.application(urls, globals())