"""
This checks for bad or incomplete signatures.
"""

# NOTE(review): this only binds a module-level Python name (spelled
# "BREAKBOINT"); it does not set the PYTHONBREAKPOINT environment variable,
# so it presumably does not disable breakpoint() as intended -- confirm.
PYTHONBREAKBOINT = 0 #disallow breakpoints

from collections import Counter
from unicode import latin1_to_ascii
import json
import pdb
import re
import os
import sys
import time
import glob
import shutil
from pprint import pprint, pformat
import gzip
import csv
import platform
import inspect
from functools import partial
import os.path

#Globals

# Pick the local website base from the platform string.  Every path below is
# built as LOCALBASE + "name", so both values must end with a slash.
currOS = platform.platform()
if currOS.startswith('Windows'):
    LOCALBASE = 'C:/website/'
    #LOCALBASE = 'A:/test/'
else:
    LOCALBASE = '/ssd/home/sarge/prog/python/fron/'
HTTPBASE = "http://gerbode.net/"
# Current directory with runs of backslashes turned into '/' and a trailing '/'
CURRDIR = re.sub('\\\\+', '/', os.getcwd()) + '/'
FACHEAD = LOCALBASE + "facsimiles/"
instCount = 0   #count of new/incorrect instruments
# NOTE(review): the original comment here was garbled ("count of items in
# partCount that are int in ensemble"); confirm what this counter tracks.
partCount = 0   #count of part items flagged against the ensemble
typeCount = 0   #count of new/incorrect types
nameCount = 0   #count of new/incorrect proper names: composers, publishers, anthologists, etc.
facCount = 0    #count of unmatched facsimile strings
newFacs = 0 # Count of missing facsimile pages not already listed as missing

def join_with_fslash(s1, s2):
    """Return s1 and s2 joined by a single forward slash."""
    pieces = (s1, s2)
    return '/'.join(pieces)
# end join_with_fslash

# Wall-clock time at import, in whole seconds since the epoch
startTime = int(time.time())
# Data/input files contained in the local website base (LOCALBASE).
# CONTRIBDIR is used by Fronimo.make_contrib_dir; NAMESFILE, INSTSFILE,
# TYPEFILE, MISSINGPAGES and NOFACDIRS are read by Fronimo.open_files;
# TEMPLATE is the default input file of Fronimo.__init__.
CONTRIBDIR = LOCALBASE + "contributors"
NAMESFILE = LOCALBASE + "namedata.tsv"
INSTSFILE = LOCALBASE + "instruments.tsv"
TYPEFILE = LOCALBASE + "types.tsv"
TEMPLATE = LOCALBASE + "templates/template.ft3" 
MISSINGPAGES = LOCALBASE + "missing_pages.txt"
NOFACDIRS = LOCALBASE + "noFacDirs.txt"
# Error/output files located in the current directory (CURRDIR).
# Fronimo.open_files opens NEWFACDIRS, NEWNAMES, NEWTYPES, NEWINSTS and
# FACERRS for writing.
NEWFACDIRS = CURRDIR + "newFacDirs.txt"
NEWNAMES = CURRDIR + "newnames.txt"
# Nonexistent facsimiles in valid directories
FACERRS = CURRDIR + "facerrs.txt"
NEWTYPES = CURRDIR + "newtypes.txt"
NEWINSTS = CURRDIR + "newinsts.txt"
DERRFILE = CURRDIR + "dfterrs.txt"
CERRFILE = CURRDIR + "cfterrs.txt"
EERRFILE = CURRDIR + "efterrs.txt"
TERRFILE = CURRDIR + "tfterrs.txt"
TSVFILE = CURRDIR + "dft.tsv"
JSONFILE = CURRDIR + "dft.json"
# Directories where we don't look for fronimo files
BADDIRS = ['midi', 'tabs', 'pdf', 'other', 'videos', 'TEMP', 'old_dft.pls',
'fronimo', 'images', 'icons', 'index_files', 'ftp', 'facsimiles',
'making_lute_music_accessible_files', 'contributors']
# For converting RTF text to latin1 and vice versa: Fronimo.latin2rtf wraps
# its output between RTFPREFIX and RTFSUFFIX.
RTFPREFIX = "{\\rtf1\\ansi\\ansicpg1252\\deff0\\deflang1033{\\fonttbl{\\f0\\fnil\\fcharset0 MS Shell\r\nDlg;}}\r\n\\viewkind4\\uc1\\pard\\f0\\fs-22 ";
RTFSUFFIX = "\\par\r\n}\r\n";
# File extensions treated as images (presumably facsimile page files -- confirm)
IMAGETYPES = ('.png', '.tif', '.pdf', '.jpg')

# Compiled regular expressions
# Patterns compiled with re.DOTALL let '.' match '\n' as well.
# Patterns containing regex escapes such as \( or \D are raw strings so that
# Python does not warn about invalid string escapes.
# Legend: '#' = number in a series; '%' = arbitrary number to make source unique
# A literal "\u", any four characters and a trailing space
reUnicode = re.compile('(\\\\u.... )')
reSplitAndOr = re.compile('(..*) (?:and|or) (..*)')
# Parenthesised four-digit year, optionally preceded by '>' and 'c.'/'ca.'
reGetDate = re.compile(r'\D\((\>?c?a?\.? ?[1-2][0-9]{3})\)', re.DOTALL)
# Group 1 is the approximate-date marker in front of the year
reApproxDate = re.compile(r'(ca?\.? ?)[1-2][0-9]{3}', re.DOTALL)
# Groups: text before the first '(', text inside it, text after the ')'
reParenContents = re.compile(r'^([^(]*)\(([^)]+)\)(.*)', re.DOTALL)
reEntabulated = re.compile('([IiEe]ntabulated)', re.DOTALL)
reEncoded = re.compile('[Ee](ncoded)', re.DOTALL)
reEdited = re.compile('[Ee](dited)', re.DOTALL)
# The numeric flag 1 formerly passed to the next two patterns was dropped:
# it is re.TEMPLATE in Python versions that still define it (deprecated in
# 3.11) and has no effect on these repeat-free patterns.
reEnc = re.compile(r'[Ee](nc\.)')
reEd = re.compile(r'[Ee](d\.)')
reInfo = re.compile('^(...).*?: *(..*)$', re.DOTALL)
reKey = re.compile('^[A-G][b#]*[Mm]$',re.DOTALL)
# Two double-quoted strings on one line: group 1 and group 2
reGetDictItem = re.compile('"([^"]*)"[^"]*"([^"]*)"')
# An RTF "\lang" control word followed by exactly four digits.  The closing
# brace of the {4} quantifier used to sit outside the group, so the pattern
# never matched a real language code.
reDeLang = re.compile(r'(\\lang[0-9]{4})')

# These are for error messages.
# Each helper looks n + 1 frames up from its own frame:
#   n = 0 -> the function that calls the helper,
#   n = 1 -> the caller of that function,
#   n = 2 -> the caller of the caller, etc.
def funcName(n=1):
    """Return the name of the function n + 1 frames above this one."""
    return sys._getframe(n + 1).f_code.co_name


def callerName(n=2):
    """Return the name of the function n + 1 frames above this one."""
    return sys._getframe(n + 1).f_code.co_name


def lineNo(n=1):
    """Return the current line number of the frame n + 1 levels above this one."""
    return sys._getframe(n + 1).f_lineno

# insert a string into another string at a specific location
def insert_str(stInsert, str, index):
    """Return str with stInsert spliced in before position index.

    The body previously referred to an undefined name (strInsert) and so
    raised NameError on every call.
    """
    return str[:index] + stInsert + str[index:]

def at_eof(f):
    """Return True when f's current position equals the size of its file."""
    position = f.tell()
    size = os.fstat(f.fileno()).st_size
    return position == size

def show_tuple(t):
    """Print each item of t followed by ", " on one line, then end the line."""
    rendered = ["%s, " % item for item in t]
    print("".join(rendered), end = "")
    print("")

# Difficulty names; Fronimo.get_diff_val returns values 0-6
# (presumably indexes into this list -- confirm).
difficulties = ["??", "Beginner", "Easy", "Medium", "Challenge", "Difficult", "Virtuoso"]

# (column label, Fronimo attribute name) pairs.  Fronimo.__init__ creates one
# attribute per pair.  The "Source" entry used to name the attribute "Source",
# which did not match key_order or the self.source attribute used by the
# methods.
field_map = [
    ("Title", "title"),
    ("Subtitle", "subtitle"),
    ("Composer", "composer"),
    ("Orig. composer", "composer0"),
    ("Footnote", "footnote"),
    ("Source", "source"),
    ("Document", "document"),
    ("Volume", "volume"),
    ("Date", "date"),
    ("Page", "page"),
    ("Editor", "editor"),
    ("Encoder", "encoder"),
    ("Arranger", "arranger"),
    ("Intabulator", "intabulator"),
    ("Concordances", "concordances"),
    ("Contributor", "contributor"),
    ("Info", "info"),
    ("Piece", "piece"),
    ("Section", "section"),
    ("Type", "type"),
    ("Key", "key"),
    ("Difficulty", "difficulty"),
    ("Ensemble", "ensemble"),
    ("Part", "part"),
    ("Remarks", "remarks"),
    ("Recording", "recurl"),
    ("Facsimile", "facurl"),
    ("Fronimo", "stFile"),
    ("PDF", "stPdf"),
    ("Midi", "stMidi"),
    ("Modified", "mtime"),
    ("Created", "ctime"),
    ]

key_order = [   # for dumping
    "title", "subtitle",
    "composer", "composer0",
    "footnote", "source", "document", "volume", "date", "page",
    "editor", "encoder", "arranger", "intabulator",
    "contributor", "concordances",
    "info", "piece", "section",
    "type", "key", "difficulty",
    "ensemble",
    "part",
    "remarks","recurl", "facurl",
    "stFile", "stPdf", "stMidi",
    "ctime", "mtime",
    ]

#Creates fronimo object from file with name stIn
class Fronimo:
    def __init__(self, stIn = TEMPLATE):
        """Build a Fronimo object from the fronimo file named stIn.

        Every attribute named in field_map starts out as an empty string.
        The file is then read and unzipped into self.stFron, the starting
        offset is looked up, and populate() fills in the remaining values.
        self.start_offset is -1 when reading the file or populating fails.
        """
        for _column, attribute in field_map:
            setattr(self, attribute, "")
        # Text attributes that start out empty
        for name in ("currProg", "credits", "begText", "endText", "stFron",
                     "footnote", "volume", "info", "recurl", "facurl"):
            setattr(self, name, "")
        # Flags that start out off
        for name in ("performance", "simple", "ornamented"):
            setattr(self, name, False)
        # File handles and offsets are not known yet
        for name in ("flFron", "flOut", "flErr", "offset", "oldOffset"):
            setattr(self, name, None)
        self.base = LOCALBASE
        # input file name set from argument stIn
        self.stFile = stIn
        # Creates self.stFron from fronimo file: self.stFile
        if self.read_and_unzip_file() == False:
            # marker for failure of class instantiation
            self.start_offset = -1
            return
        self.start_offset = self.get_start_offset()
        # Fill in the other values of the object; failure is marked the same way
        if self.start_offset > 0 and self.populate() == False:
            self.start_offset = -1
# end of  __init___

    # Open error file
    @classmethod
    def open_error(kls, errfile):
        try:
            kls.flErr = open(errfile, "w", encoding = 'latin1')
        except OSError:
            print("OSError: Cannot open error File ", errfile,
            " for writing.", file=sys.stderr)
            return False
        except:
            print("Other error: Cannot open error File ", errfile,
            " for writing.", file=sys.stderr)
            return False
        return True
# End open_error

    @classmethod
    def open_file(kls, fl, mode):
        try:
            flName = open(fl, mode, encoding='latin1')
        except OSError:
            err= "OSError: Cannot open file %s in mode %s." % (fl,  mode)
            kls.print_error(fl, err)
            return None
        except:
            err= "Other error: Cannot open file %s in mode %s." % (fl,  mode)
            kls.print_error(fl, err)
            return None
        return flName
# end of openFile

# open and read all needed fronimo-related files
    @classmethod
    def open_files(kls):
        """Load the lookup tables and open the output lists used by the checker.

        Reads NAMESFILE, INSTSFILE, TYPEFILE, NOFACDIRS and MISSINGPAGES into
        kls.nameList, kls.instList, kls.typeList, kls.noFacList and
        kls.missingPageList, then opens NEWFACDIRS, NEWNAMES, NEWTYPES,
        NEWINSTS and FACERRS for writing.  Every file handle is kept on the
        class (kls.flNames, kls.flNewNames, ...).

        Returns True when every file could be opened; otherwise prints which
        file failed and returns False.
        """
        kls.nameList = []
        kls.typeList = []
        kls.instList = []
        kls.noFacList = []

        # Read in and process list of recognized names
        kls.flNames = kls.open_file(NAMESFILE, "r")
        if not kls.flNames:
            print("Cannot open NAMESFILE file.")
            return False
        for line in kls.flNames.read().split('\n'):
            if line == '':
                continue
            lsRec = line.split('\t')
            # Default slug is the first column; it is used when the second
            # column is missing or is not of the form "Last, First".
            slug = lsRec[0]
            if len(lsRec) > 1 and ',' in lsRec[1]:
                val = re.search('^([^,][^,]+), *(..*)', lsRec[1])
                if val:
                    # last name + first letter of first name
                    slug = val.group(1) + val.group(2)[0]
            # Remove accents
            slug = latin1_to_ascii(slug)
            # Prepend the slug to the record and add the record to the names list
            lsRec.insert(0, slug)
            kls.nameList.append(lsRec)

        # Read in instrument list
        kls.flInsts = kls.open_file(INSTSFILE, "r")
        if not kls.flInsts:
            print("Cannot open INSTSFILE file.")
            return False
        for line in kls.flInsts.read().split('\n'):
            kls.instList.append(line.split('\t'))

        # Read in types list
        kls.flTypes = kls.open_file(TYPEFILE, "r")
        if not kls.flTypes:
            print("Cannot open TYPEFILE file.")
            return False
        for line in kls.flTypes.read().split('\n'):
            kls.typeList.append(line.split('\t'))

        # Read in past references to nonexistent facsimiles
        # A time saver, but needs to be updated as facsimiles are added
        kls.flNoFacDirs = kls.open_file(NOFACDIRS, "r")
        if not kls.flNoFacDirs:
            print("Cannot open NOFACDIRS file.")
            return False
        # Read in the whole list as is
        kls.noFacList = kls.flNoFacDirs.read().split('\n')

        # Read in missing pages list
        # A time saver, but needs to be updated as facsimiles are added
        kls.flMissingPages = kls.open_file(MISSINGPAGES, "r")
        if not kls.flMissingPages:
            print("Cannot open MISSINGPAGES file.")
            return False
        # Read in the whole list as is
        kls.missingPageList = kls.flMissingPages.read().split('\n')

        # Open the new-facsimile-directory list and the error lists for writing
        outputs = (
            ("flNewFacDirs", NEWFACDIRS, "NEWFACDIRS"),
            ("flNewNames", NEWNAMES, "NEWNAMES"),
            ("flNewTypes", NEWTYPES, "NEWTYPES"),
            ("flNewInsts", NEWINSTS, "NEWINSTS"),
            ("flFacErrs", FACERRS, "FACERRS"),
        )
        for attribute, path, label in outputs:
            handle = kls.open_file(path, "w")
            setattr(kls, attribute, handle)
            if not handle:
                print("Cannot open %s file." % label)
                return False
        return True
# end open_files
    
    @classmethod
    def print_headers(kls):
        print("[", file=kls.flJson)
        print("Title\tSubtitle\tComposer\tOrig. composer\tSource\tDocument\tVolume\tDate\tPage\tEditor\tEncoder\tArranger\tIntabulator\tContributor\tConcordances\tPiece\tSection\tType\tKey\tDifficulty\tEnsemble\tPart\tRemarks\tRecording\tFacsimile\tFronimo\tPDF\tMidi\tCreated\tModified", file=kls.flTsv)
# end print_headers
    
# for future use
    def make_contrib_dir(self):
        """Create this object's contributor directory under CONTRIBDIR.

        The directory name is self.contributor case-folded, with periods
        removed and spaces turned into underscores.  The resulting path is
        stored in self.contribDir and the directory is created if missing.

        This used to be declared a classmethod while reading self, and used
        '-' where an assignment was meant, so it could never run; it is now
        an ordinary instance method.
        """
        s = self.contributor.casefold()
        s = s.replace('.', '')
        s = s.replace(' ', '_')
        self.contribDir = join_with_fslash(CONTRIBDIR, s)
        if not os.path.exists(self.contribDir):
            os.makedirs(self.contribDir)

    # Get canonical name for type
    @classmethod
    def find_canonical_type(kls,typ):
        typ = typ.strip()
        typ = typ.lower()
        for item in kls.typeList:
            if item[0] == typ:
                return item[1]
        return None
    #end find_canonical_type
        
    # Finds all types in a hierarchy below a given canonical type
    @classmethod
    def get_all_types(kls, typ):
        typOut = [typ]
        for t in kls.typeList:
            if len(t) == 3: #only look at items that have higher types
                # if there is an "and"
                tpH = t[2].split('&')
                # See if the search pattern matches one of the alternatives
                if typ in tpH:
                    newType = kls.get_all_types(t[1])
                    if newType:
                        # No duplications allowed
                        if not newType in typOut:
                            typOut = typOut + newType
        return typOut

    # Starts with a comma-separated list of types, validates them,
    #gets canonical name, # and returns the validated list with all sub-types
    @classmethod
    def get_type_list(kls, typesIn):
        typelist = typesIn.split(',')
        typesOut = []
        for typ in typelist:
            t = kls.find_canonical_type(typ)
            if t == None:
                print("Type %s not found." % typ)
#               Fronimo.print_error("Type %s not found." % typ)
                continue
            else:
                typesOut += kls.get_all_types(t)
        return(typesOut) 

# print helpful error message
    @classmethod
    def print_error(kls, currFile, errMsg):
        """Write a located error message to kls.flErr and flush it.

        The message names the function that reported the error, that
        function's caller, the line number of the report, kls.currProg, the
        file being processed (currFile) and the message text (errMsg).
        """
        # The frame helpers must be called directly from this method: they
        # count stack frames relative to their caller.
        details = (funcName(), callerName(), lineNo(), kls.currProg, currFile, errMsg)
        report = "In %s; caller:%s; line:%d of %s\nfile:%s; %s." % details
        print(report, file=kls.flErr)
        kls.flErr.flush()

# correct all the files given in a json file
    @classmethod
    def correct_all(kls, f):
        """Apply the corrections listed in an open json file to fronimo files.

        f is read line by line.  After the opening '[', each record runs from
        a '{' line to a '}' line and consists of lines holding two
        double-quoted strings (key and value).  A bracket, brace or quote
        counts as a marker only when it first occurs within the first six
        characters of a line.  For every record the file named by its
        'stFile' entry is loaded, the record's values are set on it, and the
        file is rewritten when its contents changed.

        Returns the number of fronimo files rewritten.

        Fixes relative to the earlier version: an unparsable line no longer
        loops forever (and no longer calls print_error with too many
        arguments); errors are reported against the json file or the fronimo
        file actually being processed instead of a class attribute that is
        not defined in this class; a record without 'stFile' is reported
        instead of raising KeyError; and a record's '}' can no longer be
        skipped while searching for the next quoted line.
        """
        def marks(text, mark):
            # True when mark first occurs within the first six characters
            return 0 <= text.find(mark) <= 5

        jsonName = getattr(f, 'name', '')
        count = 0
        line = f.readline()
        # Find and discard the '[' at the beginning of the file
        while not at_eof(f) and not marks(line, '['):
            line = f.readline()
        while not at_eof(f) and not marks(line, ']'):
            # Find and discard { at beginning of json record
            while not at_eof(f) and not marks(line, '{'):
                line = f.readline()
            if at_eof(f):
                return count
            dChanges = {}
            # Read the json record up to the "}" and load dChanges
            while not at_eof(f) and not marks(line, '}'):
                # Find the next line starting with a quote, without running
                # past the end of this record
                while (not at_eof(f) and not marks(line, '"')
                       and not marks(line, '}')):
                    line = f.readline()
                if at_eof(f):
                    return count
                if not marks(line, '"'):
                    # Reached the closing '}' of the record
                    break
                # Protect escaped quotes while the line is split
                line = line.replace('\\"', '\xA4')
                item = reGetDictItem.search(line)
                if item is None:
                    kls.print_error(jsonName, "cannot parse json line, %s" % line)
                    # Move on, otherwise the same line is examined forever
                    line = f.readline()
                    continue
                typ = item.group(1)
                val = item.group(2).replace('\xA4','"')
                dChanges[typ] = val
                line = f.readline()
            # Load fron file corresponding to this record
            fronFile = dChanges.get('stFile')
            if not fronFile or not os.path.isfile(fronFile):
                msg = "File %s in dft.json does not exist"  % (fronFile)
                print(msg, file=kls.flErr)
                continue
            fron = Fronimo(fronFile)
            # fron.stFron now created from this fronimo file,
            # and fronimo object fron populated with values from that file.
            # Punt if the file could not be loaded.
            if fron.start_offset == -1:
                continue
            # Save old fronimo string.
            stOldFron = fron.stFron
            # Load changes from json record into fron, replacing old values
            fron.load_changes(dChanges)
            if not fron.depopulate():
                kls.print_error(fronFile, "Cannot update fronimo file string.")
                continue
            # if no changes to make, do nothing.
            if stOldFron == fron.stFron:
                continue
            if not fron.write_file(False):
                kls.print_error(fronFile, "Cannot write out fronimo file.")
                continue
            count += 1
            line = f.readline()
        return count
#end correct_all

#print missing facsimile pages
    def no_fac_page(self, location):
        """Report a missing facsimile page unless it is already listed.

        location has FACHEAD removed before it is compared with
        self.missingPageList.  For a page not in that list, the global
        newFacs counter is incremented and a line of the form
        "<fronimo file> ||| <location>" is written to Fronimo.flFacErrs.
        """
        global newFacs
        # No point in printing out facsimile directory location
        relLocation = location.replace(FACHEAD, '')
        if relLocation in self.missingPageList:
            return
        newFacs += 1
        # Shorten the fronimo file name for the report
        shortName = self.stFile.replace(LOCALBASE, '')
        for longForm, shortForm in (('composers', 'cmps'), ('sources', 'srcs')):
            shortName = shortName.replace(longForm, shortForm)
        print(shortName + ' ||| ' + relLocation, file=Fronimo.flFacErrs)
#End of no_fac

# Get starting offset for reading/writing a fronimo file
    def get_start_offset(self):
        if (self.stFron[4] == '\x15') or (self.stFron[4] == '\x14'):
            return 364
        elif self.stFron[4] == '\x16':
            return 368
        else:
            val = hex(ord(self.stFron[4]))
            msg = "stFron[4] = %s (not \\x15 or \\x16), so starting offset is unknown" % (val)
            Fronimo.print_error(self.stFile, msg)
            return -1
# end get_start_offset
    
# load changes from a json record into a Fronimo class instance
    def load_changes(self, dChanges):
        for k,v in dChanges.items():
            setattr(self, k, v)
# end of load_changes

# read and unzip a fronimo file with name self.stFile into self.stFron
    def read_and_unzip_file(self):
        self.flFron = gzip.open(self.stFile)
        if self.flFron == None:
            Fronimo.print_error(self.stFile, 'Cannot open and unzip fronimo file.')
            return False
        # Read entire file into a global byte array
        btFron = self.flFron.read()
        if len(btFron) < 100:
            Fronimo.print_error(self.stFile, 'Cannot read fronimo file.')
            return False
        self.stFron = btFron.decode("latin1")
        if len(self.stFron) < 100:
            Fronimo.print_error(self.stFile, 'Cannot decode fronimo file.')
            return False
        return True
# end of read_and_unzip_file

    # get x number of chars from string. Updates offset value
    def _get(self, numChars):
        if numChars == 0:
            return ""
        oldOffset = self.offset
        self.offset += numChars
        if (self.offset) > len(self.stFron):
            errMsg = "Cannot get %d chars starting at offset %d" % (numChars, oldOffset)
            Fronimo.print_error(self.stFile, errMsg)
            return ""
        return self.stFron[oldOffset:self.offset]

    # Get two bytes of info
    def _getWord(self, fSigned):
        inCh = self._get(1)
        if inCh == "":
            errMsg = "No first byte in stFron at offset %d" % (self.offset)
            Fronimo.print_error(self.stFile, errMsg)
            return -1
        word1 = ord(inCh)
        inCh = self._get(1)
        if inCh == "":
            errMsg = "No 2nd byte in stFron at offset %d" % (self.offset)
            Fronimo.print_error(self.stFile, errMsg)
            return -1
        word2 = 256 * ord(inCh)
        word = word1 + word2
        if (fSigned and word > 32768):
            word -= 65536
        return word

    # Get a fronimo-formatted string
    def _getBstr(self):
        # first byte is string length if < 255
        firstByte = self._get(1)
        if firstByte == False:
            errMsg = "No first byte in stFron at offset %d" % (self.offset)
            Fronimo.print_error(self.stFile, errMsg)
            return ""
        length = ord(firstByte)
        if length == 0:
            return ""
        # First char 255 means a long string.
        # Next 2 chars determine string length as an unsigned integer
        if length == 255:
            length = self._getWord(False)
            if length == -1:
                errMsg = "Zero string length from _getWord, offset %d" % (self.offset)
                Fronimo.print_error(self.stFile, errMsg)
                return ""
        stOut = self._get(length)
        if stOut == "":
            errMsg = "Result of get(length) is "", at offset %d" % (self.offset)
            Fronimo.print_error(self.stFile, errMsg)
            return ""
        return stOut
        
    def _put(self, stNew):
        # Assumes new string is RTF'd, if necessary, but not in fronimo string format,
        # with leading length indicator
        # Find length of old fronimo string
        ch = self._get(1)
        if ch == '\xFF': # means next 2 chars determine length
            byte1 = self._get(1)
            if byte1 == "":
                errMsg = "No first byte in stFron at offset %d" % (self.offset)
                Fronimo.print_error(self.stFile, errMsg)
                return False
            word1 = ord(byte1)
            byte2 = self._get(1)
            if byte2 == "":
                errMsg = "No second byte in stFron at offset %d" % (self.offset)
                Fronimo.print_error(self.stFile, errMsg)
                return False
            # second byte is higher order
            word2 = 256 * ord(byte2)
            # Length of actual string + 3 bytes to specify the length
            length = word1 + word2 + 3
            # reset offset to compensate for 3 _get(1)'s
            self.offset -= 3
        else:
            # Length of actual string + 1 byte to specify the length
            length = ord(ch) + 1
            # reset offset to compensate for 1 _get(1)
            self.offset -= 1
        # find length of new string
        newLen = len(stNew)
        if newLen < 255:
            stInsert = chr(newLen)
            # We will add one length indicator to the head of the string
            newLen += 1
        else:
            stInsert = chr(255) + chr(newLen % 256) + chr(int(newLen / 256))
            # We will add three length indicators to the head of the string
            newLen += 3
        # Add leading length indicator
        stNew = stInsert + stNew
        # splice in the new string
        stOut = self.stFron[:self.offset] + stNew + self.stFron[self.offset + length:]
        # set new offset
        self.offset += newLen
        self.stFron = stOut
        return True
# end _put
        

    def latin2rtf(self, stIn):
        """Wrap latin1 text in the RTF prefix and suffix.

        Characters with code 127 or above become a backslash, an apostrophe
        and the character code in hex; '|' becomes an RTF paragraph break;
        everything else is copied unchanged.
        """
        pieces = []
        for ch in stIn:
            code = ord(ch)
            if code >= 127:
                # hex value of latin1 char --> last 2 chars of rtf code
                pieces.append("\\\'" + format(code, 'x'))
            elif ch == '|':
                pieces.append('\\par\r\n')
            else:
                pieces.append(ch)
        return RTFPREFIX + "".join(pieces) + RTFSUFFIX


    def replace_rtf_codes(self, stIn):
        """Turn RTF escapes in stIn into latin1 text.

        Each backslash-apostrophe followed by two hex digits becomes the
        character with that code, '\\par' becomes '|', the '\\cf0', '\\cf1'
        and '\\lang...' control words are removed, and double spaces are
        reduced once.

        Fixes relative to the earlier version: the hex digits are replaced
        where they stand (previously the first occurrence of those two
        characters anywhere in the string was replaced); an escape that is
        not followed by two hex digits is left alone instead of raising
        ValueError; and the unicode kludge reads group 1 of reUnicode, the
        only group that pattern has (group 2 raised IndexError).
        """
        stIn = re.sub(r"\\'([0-9A-Fa-f]{2})",
                      lambda found: chr(int(found.group(1), 16)), stIn)
        stIn = stIn.replace('\\par', '|')
        stIn = stIn.replace('\\cf1', '')
        stIn = stIn.replace('\\cf0', '')
        stIn = re.sub(r'\\lang[0-9]*', '', stIn)
        stIn = stIn.replace('  ', ' ')
        # KLUDGE to handle unicode weirdness
        val = reUnicode.search(stIn)
        if val:
            uni = val.group(1)
            # lop off final character (the space) from unicode sequence
            stIn = stIn.replace(uni, uni[:-1])
        return stIn

    def rtf2latin(self, stRtf):
        if stRtf.find('{\\rtf', 0) != 0:
            return stRtf
        if len(stRtf) < 100:
            return stRtf
        start = stRtf.find('\\f0\\fs', 0)
        if start == -1:
            return stRtf
        start += 9
        end = stRtf.find('\\par\r\n}\r\n', start + 1)
        if end == -1:
            return stRtf
        stOut = stRtf[start:end]
        stOut = stOut.replace('\\par\r\n', '\n')
        stOut = stOut.replace('\\{', '{')
        stOut = stOut.replace('\\}', '}')
        # KLUDGE to get rid of \langxxxx peculiarity that sometimes shows up
        # Might want to reinstate this if we can figure out what it means
        val = reDeLang.search(stOut)
        if val:
            lang1234 = val.group(1)
            stOut = stOut.replace(lang1234, "")
        stOut = stOut.strip(' \t')
        stOut = self.replace_rtf_codes(stOut)
        return stOut

    def get_composer0(self):
        if self.subtitle == '':
            return ''
        #Leave subtitle intact but mine for composer0
        pcont = reParenContents.search(self.subtitle)
        if pcont:
            inParen = pcont.group(2)
            hyphenIndex = inParen.find(' - ')
            if  hyphenIndex != -1:
                inParen = inParen[hyphenIndex + 3:]
        else:
            return ''
        return inParen

    def parse_document(self, doc):
        """Split the document part of a footnote into its fields.

        Sets self.date, self.page, self.document and self.volume from doc.
        A date is a four-digit year in parentheses, optionally preceded by
        '>' and/or a 'c.'/'ca.' marker; a date carrying such a marker is
        rewritten with a leading 'c.'.  A page starts with one or two of the
        characters f, p, # or %.  self.volume is '0' when doc names no
        volume.

        Returns False (after logging) when doc is empty, True otherwise.

        The pattern that strips the date from the document name now accepts
        the same optional '>' as reGetDate, so "(>1550)" is removed as well.
        """
        if doc == "":
            Fronimo.print_error(self.stFile, "No document")
            return False
        val = reGetDate.search(doc)
        if val:
            # We have a date
            self.date = val.group(1).strip()
            val = reApproxDate.search(self.date)
            if val:
                caMark = val.group(1)
                if caMark != "":
                    self.date = self.date.replace(caMark, "")
                self.date = "c." + self.date
            val = re.search(r'\(\>?c?a?\.? ?[0-9]{4}\), *([fp#%]{1,2}\.* *[^.]*)\.?$',
                            doc, re.DOTALL)
            if val:
                self.page = val.group(1).replace(" ", "")
            else:
                # A missing page is not really an error, so it is not logged
                self.page = ""
        else:
            self.date = ""
            Fronimo.print_error(self.stFile, "Cannot get date from %s" % doc)
            # Look for a page anyway (unlikely)
            val = re.search(r', *([fp#%]{1,2}\.* *.*)\.$', doc, re.DOTALL)
            if val:
                self.page = val.group(1)
            else:
                self.page = ""
                Fronimo.print_error(self.stFile, "No page # in %s" % doc)
        # Get document without date and page
        val = re.search(r' ?\(\>?c?a?\.? ?[12][0-9]{3}\)', doc)
        if val:
            self.document = doc[:val.start()]
        else:
            self.document = doc
        self.document = self.document.strip()
        # A ", v." or ", v " suffix names the volume
        val = re.search("([^,][^,]*), *v[. ] *([^,]+)", self.document)
        if val is None:
            self.volume = '0'
        else:
            self.document = val.group(1)
            self.volume = val.group(2)
        return True
    # end of parse_document

    def slugify(self,stIn):
        """Return stIn as a name usable in a path.

        Accents are removed with latin1_to_ascii; a colon with any spaces
        after it, an apostrophe, and each run of spaces become '_'.  Case is
        left unchanged.
        """
        asciiText = latin1_to_ascii(stIn)
        noColons = re.sub(': *', '_', asciiText)
        noQuotes = noColons.replace("'", '_')
        return re.sub(' +', '_', noQuotes)
        
    def make_pagedir(self):
        """Return the facsimile page directory for this piece, or ''.

        The path is FACHEAD + source slug + '/' + document slug, followed by
        '/v.<volume>_<date>' when self.volume is set and by '_<date>'
        otherwise.  A source that looks like a library siglum (capitals, a
        hyphen, capitals) is slugified; any other source is looked up in
        self.nameList, and when it is unknown it is handed to
        write_new_name and '' is returned.

        The document slug is now taken from self.document; the earlier code
        read an unassigned local variable and raised UnboundLocalError.
        """
        # document and volume obtained in parse_footnote
        # and self.source specified in populate if not in parse_footnote
        # Handle the source field first
        src = self.source
        if re.match('[A-Z]+-[A-Z]+[a-z]*', src):
            # it's a library, so slugify it
            src = self.slugify(src)
        else:
            # It's a proper name: a composer, intabulator, publisher, anthologist, etc.
            # First check if it is in names list
            src = src.replace('?', '').replace('\n', '')
            for item in self.nameList:
                if item[1] == src:
                    # item[0] contains the existing slug; underscores
                    # replace spaces for last names with spaces
                    src = re.sub(' +', '_', item[0])
                    break
            else:
                # add to list of unknown names
                self.write_new_name(src)
                # Without a knowable source there is no knowable facsimile
                # directory, so no point in continuing
                return ''
        # Now handle document field
        document = self.slugify(self.document)
        facurl = FACHEAD + src + '/' + document
        # 2 different handlings depending on whether there is a volume.
        # NOTE(review): parse_document stores '0' when no volume is named and
        # '0' is truthy here, so the no-volume branch is only reached when
        # self.volume is '' -- confirm that is intended.
        if self.volume:
            # make entry for volume directory, appending date
            # assumes a document directory doesn't have appended date when
            # there are volumes, and the volume files carry the date
            stVol = "v." + self.volume + '_' + self.date
            return facurl + '/' + stVol
        # assumes document directory has appended date if no volumes
        return facurl + '_' + self.date
    # end of make_pagedir
        
    def parse_credits(self, cred):
        """Extract the editor and encoder names from a credits string.

        ' & ' is read as ' and ', a trailing period is dropped, the
        abbreviations 'Ed.'/'ed.' and 'Enc.'/'enc.' are expanded, and
        '[IiEe]ntabulated' is read as 'Encoded'.  Two forms are recognised:
        "<Edited|Encoded> and <edited|encoded> by NAME" (NAME is both editor
        and encoder) and "<Edited|Encoded> by NAME1 <;|.|and> <edited|encoded>
        by NAME2", where the first credit type must be capitalised.
        'S.Gerbode' is expanded to 'Sarge Gerbode'.

        Returns True when one of the forms matched; otherwise sets both names
        to "" and returns False.

        The abbreviations are now expanded where they occur as separate
        words.  Previously the first 'd.' or 'nc.' anywhere in the string was
        replaced, which could corrupt a name such as "Fred." that precedes
        the abbreviation.
        """
        # expand abbreviations [Ee]d. [Ee]nc, and &
        cred = cred.replace(' & ', ' and ')
        cred = re.sub(r'\.$', '', cred)
        cred = re.sub(r'\b([Ee])d\.', r'\1dited', cred, count=1)
        cred = re.sub(r'\b([Ee])nc\.', r'\1ncoded', cred, count=1)
        # Change [IiEe]ntabulated to Encoded
        val = reEntabulated.search(cred)
        if val:
            cred = cred.replace(val.group(1), "Encoded")
        # Handle "by", "and", and ";" in credits string
        val = re.search('[Ee](?:dited|ncoded) and [Ee](?:dited|ncoded) by (..*)$', cred, re.DOTALL)
        if val:
            # One person did both jobs
            name = val.group(1).replace('S.Gerbode', 'Sarge Gerbode')
            self.editor = self.encoder = name
            return True
        val = re.search('(E(?:ncoded|dited)) by (..+) ?(?:[;.]|and) ([Ee](?:dited|ncoded)) by (..*)$', cred, re.DOTALL)
        if val:
            type1 = val.group(1)
            cred1 = val.group(2).strip()
            cred2 = val.group(4).strip()
            if type1 == "Edited":
                self.editor, self.encoder = cred1, cred2
            else:
                self.encoder, self.editor = cred1, cred2
            self.editor = self.editor.replace('S.Gerbode', 'Sarge Gerbode')
            self.encoder = self.encoder.replace('S.Gerbode', 'Sarge Gerbode')
            return True
        self.editor = self.encoder = ""
        return False
    # End parse_credits

    # sets source, document, volume, date, page, encoder, editor
    def parse_footnote(self):
        """Split self.footnote into its parts and parse each of them.

        The footnote is split on runs of two or more spaces and must yield
        either "document  credits" or "source  document  credits".  With only
        two parts self.source stays "" (the caller fills it in later).  The
        document part is handed to self.parse_document and the credits part
        to self.parse_credits.

        Returns False when the footnote is empty, has the wrong number of
        parts, or its document part cannot be parsed.  A credits string that
        cannot be parsed is reported but is not fatal, so True is returned.
        An empty footnote leaves all attributes untouched.
        """
        if self.footnote == "":
            # Explicit False (not a bare return) so every path yields a bool.
            return False
        self.source = self.document = self.volume = self.date = self.page = self.encoder = self.editor = ""
        lsParts = re.split('  +', self.footnote)
        numParts = len(lsParts)
        if numParts < 2 or numParts > 3:
            stErr = "Footnote \"%s\" has wrong # of parts (%d)" % (self.footnote, numParts)
            Fronimo.print_error(self.stFile, stErr)
            return False
        if numParts == 2:
            # source == ''; later, source will = composer
            doc, cred = lsParts
        else:
            self.source, doc, cred = lsParts
        if not self.parse_document(doc):
            Fronimo.print_error(self.stFile, "Cannot parse document:%s" % doc)
            return False
        if not self.parse_credits(cred):
            # Reported only; missing credits do not invalidate the footnote.
            Fronimo.print_error(self.stFile, "Cannot parse credits: %s." % cred)
        return True
    # End parse_footnote

    def get_diff_val(self, difficulty):
        """Convert a difficulty description into its numeric level.

        Only the first three characters of the stripped, lower-cased text
        are significant.  A single digit '0'-'6' is returned as that number;
        otherwise the prefix is looked up by name ("beg" 1, "eas"/"sim" 2,
        "med" 3, "cha" 4, "dif"/"har" 5, "vir"/"kil" 6).  A blank value
        means Medium (3).

        An unrecognised value is reported through Fronimo.print_error and
        0 is returned.
        """
        # Take the prefix from the stripped text so that surrounding
        # whitespace cannot shift or hide the significant characters.
        diff = difficulty.strip()[:3].lower()
        if not diff:
            return 3  # Medium is default difficulty value
        if diff in ('0', '1', '2', '3', '4', '5', '6'):
            return int(diff)
        levels = {
            'beg': 1,
            'eas': 2, 'sim': 2,
            'med': 3,
            'cha': 4,
            'dif': 5, 'har': 5,
            'vir': 6, 'kil': 6,
        }
        level = levels.get(diff)
        if level is None:
            stErr = "Difficulty value \"%s\" not meaningful" % (difficulty)
            Fronimo.print_error(self.stFile, stErr)
            return 0
        return level

    def parse_info(self):
        """Decode the info text (self.info) into attributes and validate it.

        Every non-blank line before the first line that starts with '--' is
        a candidate "field: value" entry.  Lines without ':' or that the
        module-level pattern reInfo does not match are skipped; otherwise
        group 1 (lower-cased) is the field tag and group 2 (stripped) the
        value, which is stored on the attribute the tag stands for.  The
        "dif" value goes through get_diff_val first.  An unknown tag is
        reported through Fronimo.print_error.

        Every non-blank line after the '--' line is a remark; remarks are
        joined with '|' into self.remarks.

        Two checks follow:
          * each entry of self.part other than "score" must occur in
            self.ensemble, as an instrument or as the tag of a
            "tag:instrument" entry; misses increment the module counter
            partCount and are reported through Fronimo.print_error;
          * each ensemble instrument must equal column 0 of some row in
            Fronimo.instList; misses increment instCount and are written to
            Fronimo.flNewInsts.

        Always returns True.
        """
        global instCount
        global partCount
        # Field tag -> name of the attribute that receives the value.
        fieldAttrs = {
            "tra": "arranger", "rea": "arranger", "arr": "arranger",
            "lib": "source", "sou": "source", "pub": "source",
            "ins": "ensemble", "ens": "ensemble",
            "doc": "document",
            "ori": "composer0", "co0": "composer0",
            "tit": "title",
            "sub": "subtitle",
            "com": "composer",
            "pag": "page",
            "edi": "editor",
            "enc": "encoder",
            "int": "intabulator",
            "con": "concordances",
            "pie": "piece",
            "fac": "facurl",
            "rec": "recurl",
            "sec": "section",
            "typ": "type",
            "key": "key",
            "par": "part",
        }
        isRemark = False
        lsRemarks = []
        for datum in self.info.split('\n'):
            datum = datum.strip()
            # Ignore blank lines
            if datum == "":
                continue
            if isRemark:
                lsRemarks.append(datum)
                continue
            if datum.startswith('--'):
                # everything after a line starting with '--' is a remark
                isRemark = True
                continue
            if ':' not in datum:
                continue
            reItem = reInfo.search(datum)
            if not reItem:
                continue
            field = reItem.group(1).lower()
            value = reItem.group(2).strip()
            if field == "dif":
                self.difficulty = self.get_diff_val(value)
            elif field in fieldAttrs:
                setattr(self, fieldAttrs[field], value)
            else:
                stErr = "Info field \"%s\" not found" % (field)
                Fronimo.print_error(self.stFile, stErr)
        self.remarks = '|'.join(lsRemarks)

        # Collect the ensemble: insList holds instruments only, ensItems
        # additionally holds the tags, which are valid part names too.
        insList = []
        ensItems = []
        for ens in self.ensemble.split(","):
            ens = ens.strip().lower()
            if ens.find(":") > 0:
                instag = ens.split(":")
                tag = instag[0].strip()
                inst = instag[1].strip()
                ensItems.append(tag)
                ensItems.append(inst)
                insList.append(inst)
            else:
                insList.append(ens)
                ensItems.append(ens)

        # Every listed part (except the full score) must be in the ensemble.
        if self.part != "":
            for part in self.part.split(","):
                part = part.strip().lower()
                if part == "score":
                    continue
                if part not in ensItems:
                    stErr = "Part \"%s\" not in ensemble list" % (part)
                    partCount += 1
                    Fronimo.print_error(self.stFile, stErr)

        # Every ensemble instrument must be a known instrument.
        for ins in insList:
            if any(item[0] == ins for item in Fronimo.instList):
                continue
            instCount += 1
            stErr = "Inst. \"%s\" not found in \"%s\"." % (ins, self.stFile)
            print(stErr, file=Fronimo.flNewInsts)
            Fronimo.flNewInsts.flush()
        return True
# end of parse_info

    def write_new_name(self, newName):
        """Log a name missing from the names list and count it.

        Increments the module counter nameCount and writes
        "<name> --> <file>" to Fronimo.flNewNames, flushing immediately.
        """
        global nameCount
        nameCount = nameCount + 1
        print("%s --> %s" % (newName, self.stFile),
              file=Fronimo.flNewNames, flush=True)

    def check_name(self, name):
        """Check that a name (or both halves of a compound name) is known.

        '?' and newline characters are removed first.  If the module-level
        pattern reSplitAndOr matches, its groups 1 and 2 are checked in turn;
        otherwise the whole name is checked.  A name is known when it equals
        column 1 of some row in Fronimo.nameList.

        Returns True when every checked name is known.  Returns False for an
        empty name, or after logging the first unknown name through
        write_new_name.
        """
        name = name.replace('?', '').replace('\n', '')
        match = reSplitAndOr.search(name)
        if match:
            candidates = [match.group(1), match.group(2)]
        elif name:
            candidates = [name]
        else:
            # Nothing to look up; reported as "not found" without logging.
            return False
        for candidate in candidates:
            known = any(entry[1] == candidate for entry in Fronimo.nameList)
            if not known:
                # NOTE(review): checking stops here, so a second unknown
                # name in a compound is not logged on this call.
                self.write_new_name(candidate)
                return False
        return True
# end of check_name

    def write_new_types(self):
        """Log every entry of the comma-separated self.type that is unknown.

        Each entry is stripped, lower-cased and has '?' removed, then compared
        with column 0 of the rows in Fronimo.typeList.  For each miss the
        module counter typeCount is incremented and "<type> --> <file>" is
        written to Fronimo.flNewTypes.
        """
        global typeCount
        for rawType in self.type.split(","):
            typ = rawType.strip().lower().replace("?", "")
            if any(entry[0] == typ for entry in Fronimo.typeList):
                continue
            typeCount += 1
            print("%s --> %s" % (typ, self.stFile), file=Fronimo.flNewTypes)
            Fronimo.flNewTypes.flush()
# end of write_new_types

# Get 0 padding for pages in the directory.
# returns [pad, suffix]
    def get_pad(self, pageDir):
        """Work out the zero padding used by page image files in pageDir.

        Only files whose last four characters (case-insensitive) are .png,
        .tif, .pdf or .jpg are considered.  From each stem one trailing
        position letter (a-m) and then one trailing mispagination letter
        (x, y or z) are removed; if what remains is all digits it counts as
        a page number.

        Returns [pad, suffix]: pad is 3, 2, 1 or 0 according to whether the
        highest page number is >= 100, >= 10, >= 1 or absent, and suffix is
        the lower-cased extension of the last image file listed ('' if none).
        """
        suffix = ''
        lastPage = 0
        for fileName in os.listdir(pageDir):
            suf = fileName[-4:].lower()
            if suf not in ('.png', '.tif', '.pdf', '.jpg'):
                continue
            suffix = suf
            page = fileName[:-4]
            # str.endswith is False for '', so stems that are empty or shrink
            # to empty (".png", "a.png") are skipped instead of raising.
            # strip position on page designation
            if page.endswith(tuple('abcdefghijklm')):
                page = page[:-1]
            # Characters to correct for mispagination in orig.
            # Like if there are 2 page 8's, second one could be 8x, third 8y.
            if page.endswith(('x', 'y', 'z')):
                page = page[:-1]
            if page.isdigit():
                lastPage = max(lastPage, int(page))
        # file numbers are 0 padded
        if lastPage >= 100:
            pad = 3
        elif lastPage >= 10:
            pad = 2
        elif lastPage >= 1:
            pad = 1
        else:
            pad = 0
        return [pad, suffix]
#end get_pad

# Convert self.page into a valid directory entry
    def get_page_entry(self, pad):
        """Turn self.page into the file-name stem of its facsimile page.

        self.page must look like "p. 12", "f. 5v", "pp. 3a", ...: one or more
        'f'/'p' characters, a period, optional spaces, then the page text up
        to the first ',' or '.'.  From that text are removed, in order, a
        trailing position letter (a-m), a trailing 'v' (verso) and a trailing
        x/y/z (mispagination marker).  A marker is only removed while
        something remains in front of it, so "p. a" yields "a".

        A purely numeric remainder is zero-filled to `pad` digits when
        pad > 1; the position letter is re-attached only to page 0 (000a,
        000b, ...).  A non-numeric remainder (a2, l4, ...) is used as is.
        The x/y/z marker and the 'v' are appended in that order.

        Returns '' when self.page is empty, contains '#' or '%', or cannot
        be parsed (the last case is reported through Fronimo.print_error).
        """
        # blank page or page containing % or # guarantees no facsimile file
        if self.page == '' or re.search('[#%]', self.page) is not None:
            return ''
        val = re.search(r'^[fp][fp]*\. *([^,.][^,.]*)', self.page)
        if val is None:
            errmsg = "cannot parse page # %s" % (self.page)
            Fronimo.print_error(self.stFile, errmsg)
            return ''
        page = val.group(1)
        lastChar = verso = extra = ''
        # The len(page) > 1 guards keep page non-empty, so page[-1] below
        # can never raise IndexError.
        # strip position on page designation
        if len(page) > 1 and page[-1] in 'abcdefghijklm':
            lastChar = page[-1]
            page = page[:-1]
        if len(page) > 1 and page[-1] == 'v':
            verso = 'v'
            page = page[:-1]
        # To handle misnumberings, may have to add x, y, or z to page numbers.
        if len(page) > 1 and page[-1] in 'xyz':
            extra = page[-1]
            page = page[:-1]
        # Special case for pages like a2, l4v, etc.
        if not page.isdigit():
            return page + extra + verso
        if pad > 1:
            page = page.zfill(pad)
        # Special case for pages like 000a, 000b, etc.
        if int(page) == 0:
            page = page + lastChar
        return page + extra + verso
# end get_page_entry

    def make_pagedir(self):
        """Build the facsimile page-directory path for this piece.

        Uses self.source, self.document, self.volume and self.date.  A source
        that starts like a library siglum (capitals, '-', capitals) is passed
        through self.slugify.  Any other source is treated as a proper name
        and replaced by column 0 of the self.nameList row whose column 1
        equals it, with spaces turned into underscores.  If no row matches,
        the name is handed to check_name and '' is returned.

        Returns FACHEAD/<src>/<document>/v.<volume>_<date> when there is a
        volume, otherwise FACHEAD/<src>/<document>_<date>.
        """
        src = self.source
        if re.match('[A-Z]+-[A-Z]+[a-z]*', src):
            # Library name: get rid of apostrophes, colons, and spaces.
            src = self.slugify(src)
        else:
            # A proper name: composer, intabulator, publisher, anthologist...
            for entry in self.nameList:
                if entry[1] == src:
                    # Use the stored slug; spaces in the last name become '_'.
                    src = entry[0].replace(' ', '_')
                    break
            else:
                # No facurl possible (e.g. a compound name), but make sure
                # any stray names still get picked up.
                self.check_name(src)
                return ''
        # lower case document, get rid of accents and replace spaces and
        # colons with underlines
        document = self.slugify(self.document)
        docDir = FACHEAD + src + '/' + document
        if not self.volume:
            # Without volumes the document directory carries the date.
            return docDir + '_' + self.date
        # With volumes the date is appended to the volume directory instead.
        return docDir + '/' + "v." + self.volume + '_' + self.date
    # end of make_pagedir 

    #Starts with an uncompressed fronimo string (stFron) and reads values into
    # an instance of a fronimo class object.
    def populate(self):
        """Read the header strings of the Fronimo data into this object.

        Strings are fetched one after another with self._getBstr (defined
        elsewhere; presumably it reads at self.offset and advances it), so
        the order of the calls below is significant.

        Sets footnote (and whatever parse_footnote derives from it), title,
        subtitle, composer0, composer, source and info.  An unparsable
        footnote is reported but does not stop processing.

        Returns False if the marker 'CPiece' is absent from self.stFron,
        True otherwise.
        """
        self.offset = self.start_offset
        self._getBstr()  # Skip page number string
        self.footnote = self._getBstr().strip()
        if not self.parse_footnote():
            Fronimo.print_error(self.stFile,
                                "Cannot parse footnote:\n%s" % self.footnote)
        piecePos = self.stFron.find('CPiece')
        # This is very unlikely to happen
        if piecePos == -1:
            Fronimo.print_error(self.stFile, "Cannot find pattern 'CPiece'")
            return False
        self.offset = piecePos + 14
        self.title = self.rtf2latin(self._getBstr().strip(' \t'))
        # NOTE(review): the strip argument is a set of characters, so
        # '[\t ]' also removes leading/trailing square brackets.
        subtitle = self.rtf2latin(self._getBstr().strip('[\t ]'))
        self.subtitle = subtitle.replace('\n', '|')
        self.composer0 = self.get_composer0()
        self.composer = self.rtf2latin(self._getBstr().strip('[\t ]'))
        # Fall back on the composer when the footnote named no source.
        if self.source == "":
            self.source = self.composer.replace('?', '')
        if self.source == 'Anonymous':
            self.source = 'Unknown'
        # Discard text at beginning and end of section
        for _ in range(2):
            self._getBstr()
        # Key field is not read here; the key comes from the info field.
        self.info = self._getBstr().strip().replace('\r', '')
        self.composer = self.composer.replace('Anon.', 'Anonymous')
        self.composer0 = self.composer0.replace('Anon.', 'Anonymous')
        return True
# End Populate

def _normalize_title(title):
    """Return title lower-cased, without leading/trailing numbering."""
    title = re.sub("^[0-9 .]+", "", title)
    title = re.sub("[0-9 ]+$", "", title)
    return title.lower()


def main():
    """Report .ft3 files that share a signature but disagree otherwise.

    Walks the tree under CURRDIR, skipping any directory whose path contains
    "/<name>" for a name in BADDIRS, and loads every .ft3 file as a Fronimo
    object.  For each readable file it records the signature (source,
    document, volume, page), the "other" data (composer, composer0, title)
    and the file name.

    The records are sorted, and each adjacent pair with equal signatures is
    compared on the "other" data after numbering has been stripped from the
    titles and they have been lower-cased.  Mismatching pairs are written to
    badsigs.txt; the total is written there and to stderr.

    Returns False if badsigs.txt cannot be opened, otherwise None.
    """
    # open() raises on failure rather than returning a falsy object.
    try:
        flBadSigs = open('badsigs.txt', "w", encoding='latin1')
    except OSError:
        print("Cannot open bad signatures file.")
        return False

    # The with-block guarantees the report is flushed and closed.
    with flBadSigs:
        lsAll = []
        # Walk the main directory tree; os.walk yields each directory once.
        for root, _dirs, files in os.walk(CURRDIR):
            root = root.replace("\\", '/')
            # Won't find any .ft3 files in directory trees starting with
            # BADDIRS, so don't process these.  any() also copes with an
            # empty BADDIRS.
            if any(("/" + target) in root for target in BADDIRS):
                continue
            for fname in files:
                if not fname.endswith('.ft3'):
                    continue
                # open and unzip fronimo file
                fron = Fronimo(root + '/' + fname)
                if fron.start_offset <= 0:
                    # means file not processed correctly
                    print(fron.stFile + ' ' + 'damaged.')
                    continue
                signature = [fron.source, fron.document, fron.volume, fron.page]
                other = [fron.composer.rstrip(), fron.composer0.rstrip(),
                         fron.title.rstrip()]
                lsAll.append([signature, other, fron.stFile])

        # Sorting puts equal signatures next to each other.
        lsAll.sort()
        count = 0
        for prev, curr in zip(lsAll, lsAll[1:]):
            if curr[0] != prev[0]:
                continue
            # Allow for added numberings: compare (and report) the titles
            # without them.  The records are updated in place.
            curr[1][2] = _normalize_title(curr[1][2])
            prev[1][2] = _normalize_title(prev[1][2])
            if curr[1] == prev[1]:
                continue
            source, document, volume, page = curr[0]
            print(f"{source}|{document}|{volume}|{page}|", file=flBadSigs)
            print('in', prev[2], 'and', file=flBadSigs)
            print(f"{curr[2]}", file=flBadSigs)
            print(f"{prev[1]}!=", file=flBadSigs)
            print(f"{curr[1]}", file=flBadSigs)
            print('--------------', file=flBadSigs)
            count += 1
        print(str(count) + ' ' + 'bad/duplicate signatures.', file=flBadSigs)
    print(str(count) + ' ' + 'bad/duplicate signatures.', file=sys.stderr)
# end main

# Run the signature check only when executed as a script, not on import.
if __name__ == "__main__":
    main()