#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
# Dinko Korunic 'kreator', 2006.
# name_index.py
# script for generating name index from a given wordlist

"""This script can be used to:
- generate a name index from a TXT file which was generated through pdftotext,
- or index any other file where pages are split with formfeed [ASCII 0x0c]
  character.

Usage:
    name_index.py original_text wordlist catalog

    - original text is the given text to index with each word from the
      given wordlist,
    - catalog will contain the parsed output.

Example:
    (1) pdftotext -enc UTF-8 original.pdf
    (2) python name_index.py original.txt wordlist.txt catalog.txt
"""

# History:
# 1.2 - Throw out the map() in the favor of list comprehensions
# 1.1 - Use local argv if possible, not only sys.argv
# 1.0 - Initial release

__copyright__ = """Copyright (C) 2005 Dinko Korunic, InfoMAR d.o.o.

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2 of the License, or (at your
option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""

__version__ = '1.2'

import sys

# Characters trimmed from both ends of every token before it is looked up
# in the wordlist (the ASCII punctuation set).
_STRIP_CHARS = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""


def _io_failure(filename, err):
    """Report an I/O error on `filename` and terminate with exit status 1."""
    print('I/O error with %s(%s): %s' % (filename, err.errno, err.strerror))
    sys.exit(1)


def usage():
    """Print program usage string and gracefully exit"""
    usage_text = """Usage: %s original_text wordlist catalog

Original text is the given text to index with each word from the given
wordlist. Catalog file will eventually contain the parsed output, if
successfully analysed.
"""
    print(usage_text % sys.argv[0])
    sys.exit(0)


def load_index(wordlist_filename):
    """Load wordlist from file into a list.

    Every whitespace-separated token of the file becomes one entry, in
    file order (duplicates are kept).

    Prints an error message and exits with status 1 if the file cannot
    be read.
    """
    wordlist = []
    try:
        # 'with' guarantees the handle is closed even if reading fails
        with open(wordlist_filename, 'r') as wordlist_file:
            for line in wordlist_file:
                wordlist.extend(line.split())
    except IOError as err:
        _io_failure(wordlist_filename, err)
    return wordlist


def index_file(wordlist, source_filename, pageno=1):
    """Index whole file using wordlist and return an index.

    wordlist        -- iterable of words to look for (exact match after
                       punctuation has been stripped from both ends)
    source_filename -- text file whose pages are separated by formfeed
                       characters
    pageno          -- number assigned to the first page

    Returns a dictionary mapping every word found to a dictionary whose
    keys are the page numbers the word occurs on (the values are always 0).

    Prints an error message and exits with status 1 if the file cannot
    be read.
    """
    result = {}
    # hashed membership test instead of scanning a list for every token
    lookup = frozenset(wordlist)
    try:
        with open(source_filename, 'r') as source_file:
            for line in source_file:
                # Every formfeed ends a page, wherever it sits in the line
                # and however many there are (blank pages give several in
                # a row).  Splitting on it first is required because
                # str.split() would silently swallow it as whitespace.
                for position, segment in enumerate(line.split('\f')):
                    if position:
                        print('Done with page %d' % pageno)
                        pageno += 1
                    for word in segment.split():
                        # strip words of different \W* endings
                        clean_word = word.strip(_STRIP_CHARS)
                        if clean_word in lookup:
                            result.setdefault(clean_word, {})[pageno] = 0
        print('Completed.')
    except IOError as err:
        _io_failure(source_filename, err)
    return result


def print_results(result, catalog_filename):
    """Write out the results into the catalog file.

    One line is written per word, words in sorted order, in the form
    "word: p1 p2 ..." with the page numbers in ascending order.

    Prints an error message and exits with status 1 if the file cannot
    be written.
    """
    try:
        with open(catalog_filename, 'w') as catalog_file:
            for word in sorted(result):
                pages = ' '.join([str(page) for page in sorted(result[word])])
                catalog_file.write('%s: %s\n' % (word, pages))
    except IOError as err:
        _io_failure(catalog_filename, err)


def main(argv=None):
    """Run the whole pipeline: load wordlist, index the text, write catalog.

    argv -- argument list laid out like sys.argv (program name, original
            text, wordlist, catalog); defaults to sys.argv
    """
    # parse CLI arguments or arguments from main()
    if argv is None:
        argv = sys.argv

    # check if proper number of arguments used
    if len(argv) != 4:
        usage()

    # read index
    wordlist = load_index(argv[2])

    # do the indexing
    result = index_file(wordlist, argv[1])

    # and finally do the output
    print_results(result, argv[3])


if __name__ == '__main__':
    # import Psyco if available
    try:
        import psyco
        psyco.full()
    except ImportError:
        pass
    sys.exit(main())