#!/usr/bin/env python

"""
Writer.py   $Id: Writer.py,v 1.23 2001/11/14 00:50:38 janssen Exp $

Write a collection of documents into various formats.

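Currently implemented: the traditional Plucker cache directory format
(CacheWriter), a ready-to-sync PDB file (PDBWriter), and an in-memory
dictionary of records (DictWriter).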


Copyright 1999,2000 by Holger Duerer <holly@starship.python.net>

Distributable under the GNU General Public License Version 2 or newer.
"""


import os, struct, string, time, helper.PQAAppInfo, sys, urllib
import PyPlucker
from PyPlucker import Url, PluckerDocs
from PyPlucker.helper import prc, dict
from PyPlucker.helper.CharsetMapping import charset_mibenum_to_name


class Mapper:

    """Assign record IDs to URLs and PluckerDocs.PluckerDocument instances.

    Public methods:

      get_or_add(url_or_doc) -- return the record ID for a URL string or a
                                PluckerDocument, allocating one if needed.
      get_docs()             -- return every document that has a record ID.
      build_links()          -- return the URLs in record-ID order, with ''
                                for record IDs that have no URL.
      print_mapping()        -- dump the URL -> ID table to stderr.

    Record IDs 1-10 are reserved.  This class itself hands out 1 (index
    document), 2 (home document), 3 (link index), 4 (category) and
    5 (metadata); every other document or URL gets the next free ID,
    starting at 11.
    """

    def __init__(self, collection, alias_list):
        """Build the mapping for 'collection'.

        collection -- mapping of URL -> document; only its items() and get()
                      methods are used here.
        alias_list -- mapping of URL -> URL aliases, or None for no aliases.
        """
        # Normalise the alias table *before* first use: the fragment loop
        # below consults it, so a None argument must already be replaced.
        alias_list = alias_list or {}

        # a table of URL->URL mappings
        self._alias_list = alias_list

        # Maps URLs to documents.  A key is either a plain URL string, whose
        # value is the document itself, or a (url, fragment-id) pair, whose
        # value is a (document, paragraph-number) pair.
        self._url_to_doc_mapping = {}
        for (url, doc) in collection.items():
            self._url_to_doc_mapping[url] = doc
            # only text documents carry a table of internal fragment names
            name_mapping = (isinstance(doc, PluckerDocs.PluckerTextDocument)
                            and doc.get_name_map())
            if not name_mapping:
                continue
            for (name, (internalurl, paragraph_number)) in name_mapping.items():
                internalurl = alias_list.get(internalurl, internalurl)
                # NOTE(review): get() yields None when 'internalurl' is not in
                # the collection, so the stored pair can be (None, paragraph).
                target = collection.get(internalurl)
                self._url_to_doc_mapping[(url, name)] = (target, paragraph_number)

        # Maps a PluckerDocs.PluckerDocument instance to a record ID.
        self._doc_to_id_mapping = {}

        # Maps straight URLs without documents to a record ID.
        # Used mainly for external links.
        self._url_to_id_mapping = {}

        # first record ID issued.  Records 1-10 are reserved.
        self._current_id = 11

        # record number 2 always goes to the 'home' document (or, when no
        # document was collected for it, to the home URL itself)
        home_url = self._alias_list.get('plucker:/home.html')
        if home_url:
            home_doc = self._url_to_doc_mapping.get(home_url)
            if home_doc:
                self._doc_to_id_mapping[home_doc] = 2
            else:
                self._url_to_id_mapping[home_url] = 2

        # finally, make sure each doc has an ID assigned
        for (url, doc) in collection.items():
            self._get_id_for_doc(doc)

    def _get_id_for_doc(self, idoc, add=1):
        """Return the record ID for a document.

        'idoc' is either a document or a (document, paragraph-number) pair.
        For a pair the result is (record-id, paragraph-number); otherwise it
        is the bare record ID.  If the document has no ID yet, one is
        assigned when 'add' is true; when 'add' is false None is returned.
        """
        is_pair = isinstance(idoc, tuple)
        if is_pair:
            doc = idoc[0]
        else:
            doc = idoc
        record_id = self._doc_to_id_mapping.get(doc)
        if not record_id:
            if not add:
                return None
            if isinstance(doc, PluckerDocs.PluckerIndexDocument):
                # there's only one, and it always has record # 1
                record_id = 1
            elif isinstance(doc, PluckerDocs.PluckerLinkIndexDocument):
                record_id = 3
            elif isinstance(doc, PluckerDocs.PluckerCategoryDocument):
                record_id = 4
            elif isinstance(doc, PluckerDocs.PluckerMetadataDocument):
                record_id = 5
            else:
                record_id = self._current_id
                self._current_id = self._current_id + 1
            self._doc_to_id_mapping[doc] = record_id
        if is_pair:
            return (record_id, idoc[1])
        return record_id

    def _get_id_for_url(self, url, add=1):
        """Return the record ID for a URL.

        'url' is either a URL string or a (url, fragment-id) pair.  The
        result is whatever _get_id_for_doc() returns when a document is
        registered for the key, else the ID recorded for the bare URL.  An
        unknown (url, fragment-id) pair falls back to the ID of the plain
        URL if that is already known.  If nothing is known, a fresh ID is
        allocated when 'add' is true; otherwise None is returned.
        """
        doc = self._url_to_doc_mapping.get(url)
        record_id = doc and self._get_id_for_doc(doc, add)
        record_id = record_id or self._url_to_id_mapping.get(url)
        if not record_id:
            # possibly valid main part, but invalid tag.  Use the ID of the
            # main part in that case (look-up only, never allocate for it).
            if isinstance(url, tuple):
                record_id = self._get_id_for_url(url[0], 0)
            if not record_id and add:
                # OK, no ID, but we should assign one
                record_id = self._current_id
                self._current_id = self._current_id + 1
                self._url_to_id_mapping[url] = record_id
        return record_id

    def get_or_add(self, url_or_doc):
        """Return the record ID for a URL string or a PluckerDocument.

        For a plain URL the numeric record ID is returned.  For a URL with a
        fragment-id, a (record-id, paragraph-id) pair is returned if the
        fragment is a known paragraph of a text page, otherwise just a record
        ID.  URLs are passed through the alias table first.

        For a PluckerDocument the ID assigned to that document is returned;
        the document is registered under its own URL if that URL is not yet
        known, and it inherits an ID that was already handed out for the URL.

        Raises ValueError for any other kind of argument.
        """
        if isinstance(url_or_doc, str):
            url, tag = urllib.splittag(url_or_doc)
            finalurl = self._alias_list.get(url, url)
            if tag:
                return self._get_id_for_url((finalurl, tag))
            return self._get_id_for_url(finalurl)

        if isinstance(url_or_doc, PluckerDocs.PluckerDocument):
            url = url_or_doc.get_url()
            if url not in self._url_to_doc_mapping:
                self._url_to_doc_mapping[url] = url_or_doc
            if (url_or_doc not in self._doc_to_id_mapping
                    and url in self._url_to_id_mapping):
                self._doc_to_id_mapping[url_or_doc] = self._url_to_id_mapping[url]
            return self._get_id_for_doc(url_or_doc)

        raise ValueError("not a URL or an instance of " + str(PluckerDocs.PluckerDocument))

    def build_links(self):
        """Return the list of URL strings for all record IDs in use.

        The list is indexed by record ID and has one entry for every ID
        below the next free one; IDs without a URL get ''.  'mailto:' URLs
        are left out, and entry 1 (the index record) is always ''.
        """
        key_dict = self._url_to_doc_mapping.copy()
        key_dict.update(self._url_to_id_mapping)

        # Reduce every entry to "URL string -> record ID".  The loop changes
        # key_dict, so it walks a snapshot of the keys.
        for key in list(key_dict.keys()):
            if isinstance(key, str) and len(key) > 7 and key[:7] == 'mailto:':
                del key_dict[key]
                continue
            if isinstance(key, tuple):
                # either resolved tag, in which case value is tuple,
                # or unresolved tag, in which case value is integer
                value = key_dict[key]
                del key_dict[key]
                if isinstance(value, tuple):
                    # truncate key to just plain record
                    key = key[0]
                    value = value[0]
                else:
                    key = key[0] + '#' + key[1]
                key_dict[key] = value
            if isinstance(key_dict[key], PluckerDocs.PluckerDocument):
                key_dict[key] = self._get_id_for_doc(key_dict[key])

        # invert the dictionary: record ID -> URL
        id_to_url = {}
        for (url, record_id) in list(key_dict.items()):
            id_to_url[record_id] = url

        # build up the list of URLs
        urls = []
        for i in range(self._current_id):
            urls.append(id_to_url.get(i) or '')
        urls[1] = ''    # no URL needed for index record
        return urls

    def get_docs(self):
        """Return a list of all the documents that have a record ID.

        The list is a snapshot, so callers may keep registering documents
        with the mapper while they iterate over it.
        """
        return list(self._doc_to_id_mapping.keys())

    def print_mapping(self):
        """Write the known URLs and their record IDs to stderr.

        Fragment entries are shown as 'url#fragment' followed by the record
        ID and, in parentheses, the paragraph number.
        """
        sys.stderr.write('*********\n')
        for (url, doc) in self._url_to_doc_mapping.items():
            if isinstance(doc, tuple):
                urlname = url[0] + '#' + url[1]
                record_id = self._doc_to_id_mapping[doc[0]]
                sys.stderr.write('%70s => %3d (%d)\n' % (urlname, record_id, doc[1]))
            else:
                record_id = self._doc_to_id_mapping[doc]
                sys.stderr.write('%70s => %3d\n' % (url, record_id))
        sys.stderr.write('*********\n')



class Writer:
    """Abstract base class from which to derive the various writers
    for documents.

    A subclass implements save_data(); write() then converts every document
    of the collection into record dumps and hands them to save_data() in
    ascending record-ID order.
    """

    def __init__(self, collection, config, urlmapper=None):
        """Remember the collection (URL -> document) and the configuration.

        NOTE(review): write() always builds a fresh Mapper, so a 'urlmapper'
        passed here is replaced as soon as write() is called.
        """
        self._collection = collection
        self._config = config
        self._mapper = urlmapper

    def save_data(self, data, url, id, verbose):
        """This needs to be implemented in the derived class to
        actually output the 'data' (human readably denoted as
        'url') as something with id 'id'."""
        raise NotImplementedError("PyPlucker.Writer.Writer.save_data()")

    def _abbreviate_url(self, url):
        """Return str(url), shortened to 60 characters for progress messages."""
        urltext = str(url)
        if len(urltext) > 60:
            urltext = urltext[:40] + "....." + urltext[-15:]
        return urltext

    def _describe_charset(self, mibenum):
        """Return 'mibenum', followed by ' (name)' when a name is known."""
        charset_name = charset_mibenum_to_name(mibenum)
        return str(mibenum) + ((charset_name and " (" + charset_name + ")") or "")

    def _rank_charsets(self, charsets):
        """Pick the default charset from a {mibenum: [record ids]} mapping.

        Returns (default_mibenum, exceptions).  The default is the charset
        used by the most records, or None for an empty mapping.  'exceptions'
        lists (record id, mibenum) for every record that uses another
        charset, with an unknown (false) mibenum recorded as 0.
        """
        ranked = list(charsets.items())
        # most-used charset first; the sort is stable, so ties keep the
        # order in which the mapping yielded them
        ranked.sort(key=lambda item: len(item[1]), reverse=True)
        if not ranked:      # have to allow for image-only document
            return (None, [])
        exceptions = []
        for (charset, ids) in ranked[1:]:
            for record_id in ids:
                exceptions.append((record_id, charset or 0))
        return (ranked[0][0], exceptions)

    def _write_doc(self, out_dict, pluckerdoc, url, id, verbose):
        """Dump 'pluckerdoc' into 'out_dict', keyed by record ID.

        Each entry is the argument tuple (dump, url, id, verbose) later
        passed to save_data().  A text document may dump into several
        records; a part reporting ID 0 is stored under 'id'.  Raises
        ValueError if 'id' is not the ID the mapper has for the document.
        """
        expected_id = self._mapper.get_or_add(pluckerdoc)
        if id != expected_id:
            raise ValueError("bad id %d instead of %d" % (id, expected_id))

        if pluckerdoc.is_text_document():
            for (the_url, the_id, dump) in pluckerdoc.dump_record_with_splits(self._mapper):
                if the_id == 0:
                    the_id = id     # original
                out_dict[the_id] = (dump, the_url, the_id, verbose)
                if verbose > 1:
                    print("Converted %s" % self._abbreviate_url(the_url))
        else:
            dump = pluckerdoc.dump_record(id)
            out_dict[id] = (dump, url, id, verbose)
            if verbose > 1:
                print("Converted %s" % self._abbreviate_url(url))

    def write(self, verbose, alias_list=None):
        """Write out the collection.  Returns the mapping that was
        used to generate the ids.

        'alias_list' must offer as_dict(); None stands for "no aliases".
        Raises RuntimeError if no record 2 ('home' document) was produced.
        """
        if alias_list is None:
            aliases = {}
        else:
            aliases = alias_list.as_dict()
        self._mapper = Mapper(self._collection, aliases)

        # NOTE(review): the configured default charset is read here, but the
        # value is always replaced below before it is used -- confirm whether
        # it was meant as the fallback for collections without text pages.
        mibenum = self._config.get_int('default_charset', 0) or None

        # charset MIBenum -> list of the record ids of text pages using it
        charsets = {}

        out_dict = {}
        for pluckerdoc in list(self._mapper.get_docs()):
            id = self._mapper.get_or_add(pluckerdoc)
            if pluckerdoc.is_text_document():
                pluckerdoc.resolve_ids(self._mapper)
                doc_mibenum = pluckerdoc.get_charset()
                if verbose > 2:
                    sys.stderr.write(pluckerdoc.get_url() + ' has charset '
                                     + self._describe_charset(doc_mibenum) + "\n")
                charsets.setdefault(doc_mibenum, []).append(id)
            self._write_doc(out_dict, pluckerdoc, pluckerdoc.get_url(), id, verbose)

        ## Do some error checking
        if 2 not in out_dict:
            raise RuntimeError("The collection process failed to generate a 'home' document")

        ## set up the metadata mapping, if any
        metadata = {}
        # the default is the charset which has the 'most' pages
        mibenum, odd_charsets = self._rank_charsets(charsets)
        if mibenum is not None:
            metadata['CharSet'] = mibenum
            if verbose > 1:
                print('Default charset is MIBenum ' + self._describe_charset(mibenum))
        elif verbose > 1:
            print('No default charset')
        if len(odd_charsets) > 0:
            metadata['ExceptionalCharSets'] = odd_charsets
            if verbose > 1:
                sys.stderr.write("ExceptionalCharSets is " + str(odd_charsets) + "\n")

        ## write the index record
        tmp_url = "plucker:/~special~/index"
        index_doc = PluckerDocs.PluckerIndexDocument(tmp_url, self._config, metadata)
        self._write_doc(out_dict, index_doc, tmp_url, 1, verbose)

        ## write the URL information, if desired
        if not self._config.get_bool('no_url_info', 0):
            links = self._mapper.build_links()
            linksdocs = []
            # one links document per block of 200 record ids
            for i in range(1, len(links), 200):
                tmp_url = "plucker:/~special~/links" + str(i)
                linksdoc = PluckerDocs.PluckerLinksDocument(tmp_url, links, i)
                self._mapper.get_or_add(linksdoc)
                linksdocs.append(linksdoc)
            # now make links index
            tmp_url = "plucker:/~special~/pluckerlinks"
            links_index = PluckerDocs.PluckerLinkIndexDocument(tmp_url, linksdocs, self._mapper)
            self._mapper.get_or_add(links_index)
            # OK, write the links index document
            self._write_doc(out_dict, links_index, tmp_url, 3, verbose)
            # and write the various links documents
            for doc in linksdocs:
                self._write_doc(out_dict, doc, doc.get_url(),
                                self._mapper.get_or_add(doc), verbose)

        ## write the category information, if present
        if self._config.get_string('category') is not None:
            tmp_url = "plucker:/~special~/category"
            category_doc = PluckerDocs.PluckerCategoryDocument(tmp_url, self._config)
            self._write_doc(out_dict, category_doc, tmp_url, 4, verbose)

        ## write the metadata record, if any
        if metadata:
            tmp_url = "plucker:/~special~/metadata"
            metadata_doc = PluckerDocs.PluckerMetadataDocument(tmp_url, metadata)
            self._write_doc(out_dict, metadata_doc, tmp_url, 5, verbose)

        ## write out the special base record last.
        if not self._config.get_bool('no_base', 0):
            tmp_url = "plucker:/~special~/base"
            base = str(self._config.get_string('home_url'))
            base_doc = PluckerDocs.PluckerBaseDocument(tmp_url, base)
            self._write_doc(out_dict, base_doc, tmp_url,
                            self._mapper.get_or_add(base_doc), verbose)

        ## now hand every collected record to the concrete writer
        the_ids = list(out_dict.keys())
        the_ids.sort()  # they are numeric, so sort does the right thing
        for record_id in the_ids:
            dump, the_url, the_id, doc_verbose = out_dict[record_id]
            self.save_data(dump, the_url, the_id, doc_verbose)
            if doc_verbose:
                print("Wrote %d <= %s" % (the_id, self._abbreviate_url(the_url)))

        return self._mapper



class CacheWriter (Writer):
    """A Writer that writes the traditional format of separate files
    in a cache directory, one file per record, named by record ID."""

    def __init__(self, collection, config, cachedir):
        """'cachedir' may contain environment variables and a leading '~'."""
        Writer.__init__(self, collection, config)
        self._cachedir = cachedir

    def _expanded_cachedir(self):
        """Return the cache directory with $VARS and '~' expanded."""
        return os.path.expanduser(os.path.expandvars(self._cachedir))

    def write(self, verbose, alias_list):
        """Empty the cache directory, then write the collection into it.

        Returns the mapper from Writer.write().  If the cache directory does
        not exist or is not a directory, a message is printed and None is
        returned without writing anything.
        """
        cachedir = self._expanded_cachedir()
        if not os.path.exists(cachedir):
            print("%s does not exists!" % cachedir)
            return None
        if not os.path.isdir(cachedir):
            print("%s is not a directory" % cachedir)
            return None

        # clear the cache directory (plain files only; subdirectories stay)
        for name in os.listdir(cachedir):
            fname = os.path.join(cachedir, name)
            if os.path.isfile(fname):
                os.unlink(fname)

        # Now call the super class to do the actual work
        return Writer.write(self, verbose, alias_list=alias_list)

    def save_data(self, data, url, id, verbose):
        """Write 'data' to the file named after the record 'id'.

        The file goes into the *expanded* cache directory, i.e. the same
        directory that write() checked and cleared.
        """
        filename = os.path.join(self._expanded_cachedir(), "%d" % id)
        record_file = open(filename, "wb")
        try:
            record_file.write(data)
        finally:
            # close the handle even if the write fails
            record_file.close()


class PDBWriter (Writer):
    """A Writer that writes the items into a ready-to-synch PDB
    file."""

    def __init__(self, collection, config, name, version, filename):
        """Remember the database name, version and target file.

        'filename' is the output path, or '<stdout>' to write the database
        to standard output.  The header flags and icon settings are read
        from 'config'.
        """
        Writer.__init__(self, collection, config)
        self._filename = filename
        self._dbname = name
        self._dbversion = version
        self._pdb_file = None   # opened by write()
        self._flag_copy_prevention = config.get_bool('copyprevention_bit')
        self._flag_launchableData = config.get_bool('launchable_bit')
        self._flag_backup = config.get_bool('backup_bit')
        # an application info block is written when either option is set
        self._icon = config.get_bool('icon') or config.get_bool('launchable_bit')
        self._big_icon = config.get_string('big_icon', '')
        self._small_icon = config.get_string('small_icon', '')
        self._config = config

    def write(self, verbose, alias_list, mapping=None):
        """Create the PDB, write the collection into it and close it.

        Returns the mapper from Writer.write().  'mapping' is accepted but
        not used.  An existing output file is removed first.
        """
        if self._filename == '<stdout>':
            # '<stdout>' is a sentinel, not a path: nothing to remove.
            if sys.platform == "win32":
                # the record data is binary; stop Windows translating newlines
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            self._pdb_file = prc.File(sys.stdout, read=0, write=1)
        else:
            if os.path.exists(self._filename):
                os.unlink(self._filename)
            self._pdb_file = prc.File(self._filename, read=0, write=1)

        info = self._pdb_file.getDBInfo()
        info['name'] = self._dbname
        info['version'] = self._dbversion
        info['creator'] = 'Plkr'
        info['type'] = 'Data'
        info['createDate'] = int(time.time())
        info['modifyDate'] = info['createDate']
        # -2082844800 s before the Unix epoch is 1904-01-01 -- presumably the
        # Palm "never backed up" date; confirm against prc.  long() keeps the
        # value a Python 2 long, as the former 'L' literal did.
        info['backupDate'] = long(-2082844800)
        info['flagCopyPrevention'] = self._flag_copy_prevention
        info['flagLaunchableData'] = self._flag_launchableData
        info['flagBackup'] = self._flag_backup
        if self._icon:
            app_block = helper.PQAAppInfo.pqa_app_info_block(self._config,
                                                             self._dbname,
                                                             self._dbversion,
                                                             self._big_icon,
                                                             self._small_icon)
            self._pdb_file.setAppBlock(app_block)
        self._pdb_file.setDBInfo(info)

        # Now call the super class to do the actual work
        result = Writer.write(self, verbose, alias_list=alias_list)

        self._pdb_file.close()
        return result

    def save_data(self, data, url, id, verbose):
        """Store 'data' as record 'id' of the open PDB file."""
        assert self._pdb_file is not None, "save_data called with uninitialized pdb file"

        self._pdb_file.setRecord(attr=0, id=id, cat=0, data=data)




class DictWriter (Writer):
    """A Writer that collects the records in a caller-supplied dictionary,
    keyed by record number."""

    def __init__(self, collection, config, dict):
        """'dict' is the mapping that receives the record id -> data entries."""
        Writer.__init__(self, collection, config)
        self._dict = dict

    def save_data(self, data, url, id, verbose):
        """Store 'data' under 'id'; 'url' and 'verbose' are not used."""
        records = self._dict
        records[id] = data
