""" This checks for bad or incomplete signatures. """

# NOTE(review): presumably meant to mirror the PYTHONBREAKPOINT environment
# variable; as a plain module variable nothing visible here reads it -- confirm.
PYTHONBREAKBOINT = 0 #disallow breakpoints

from collections import Counter
from unicode import latin1_to_ascii
import json
import pdb
import re
import os
import sys
import time
import glob
import shutil
from pprint import pprint, pformat
import gzip
import csv
import platform
import inspect
from functools import partial
import os.path

#Globals
currOS = platform.platform()
if currOS.find('Windows') == 0:
    LOCALBASE = 'C:/website/'
    #LOCALBASE = 'A:/test/'
else:
    # NOTE(review): unlike the Windows value this has no trailing '/', yet the
    # paths below are built by plain concatenation -- confirm this is intended.
    LOCALBASE = '/ssd/home/sarge/prog/python/fron'
HTTPBASE = "http://gerbode.net/"
# Current directory with runs of backslashes normalised to '/'
CURRDIR = re.sub('\\\\+', '/', os.getcwd()) + '/'
FACHEAD = LOCALBASE + "facsimiles/"

instCount = 0 #count of new/incorrect instruments
partCount = 0 #count of items in partCount that are int in ensemble
typeCount = 0 #count of new/incorrect types
nameCount = 0 #count of new/incorrect proper names: composers, publishers, anthologists, etc.
facCount = 0 #count of unmatched facsimile strings
newFacs = 0 # Count of unmatched missing facsimiles


def join_with_fslash(s1, s2):
    """Return s1 and s2 joined by a single forward slash."""
    return(s1 + '/' + s2)
# end join_with_fslash


startTime = int(time.time())

#Data/input files contained in local website base
CONTRIBDIR = LOCALBASE + "contributors"
NAMESFILE = LOCALBASE + "namedata.tsv"
INSTSFILE = LOCALBASE + "instruments.tsv"
TYPEFILE = LOCALBASE + "types.tsv"
TEMPLATE = LOCALBASE + "templates/template.ft3"
MISSINGPAGES = LOCALBASE + "missing_pages.txt"
NOFACDIRS = LOCALBASE + "noFacDirs.txt"

# Error/output files located in current directory
NEWFACDIRS = CURRDIR + "newFacDirs.txt"
NEWNAMES = CURRDIR + "newnames.txt"
# Nonexistent facsimiles in valid directories
FACERRS = CURRDIR + "facerrs.txt"
NEWTYPES = CURRDIR + "newtypes.txt"
NEWINSTS = CURRDIR + "newinsts.txt"
DERRFILE = CURRDIR + "dfterrs.txt"
CERRFILE = CURRDIR + "cfterrs.txt"
EERRFILE = CURRDIR + "efterrs.txt"
TERRFILE = CURRDIR + "tfterrs.txt"
TSVFILE = CURRDIR + "dft.tsv"
JSONFILE = CURRDIR + "dft.json"

# directories where we don't look for fronimo files
BADDIRS = ['midi', 'tabs', 'pdf', 'other', 'videos', 'TEMP', 'old_dft.pls',
           'fronimo', 'images', 'icons', 'index_files', 'ftp', 'facsimiles',
           'making_lute_music_accessible_files', 'contributors']

# For converting RTF files to latin1 and vice versa
RTFPREFIX = "{\\rtf1\\ansi\\ansicpg1252\\deff0\\deflang1033{\\fonttbl{\\f0\\fnil\\fcharset0 MS Shell\r\nDlg;}}\r\n\\viewkind4\\uc1\\pard\\f0\\fs-22 "
RTFSUFFIX = "\\par\r\n}\r\n"
IMAGETYPES = ('.png', '.tif', '.pdf', '.jpg')

# Compiled regular expressions
#use re.DOTALL to match \n as well
# # = number in a series; % = arbitrary number to make source unique
# (raw strings throughout so that regex escapes such as \D and \( are not
# treated as invalid string escapes)
reUnicode = re.compile(r'(\\u.... )')
reSplitAndOr = re.compile(r'(..*) (?:and|or) (..*)')
reGetDate = re.compile(r'\D\((>?c?a?\.? ?[1-2][0-9]{3})\)', re.DOTALL)
reApproxDate = re.compile(r'(ca?\.? ?)[1-2][0-9]{3}', re.DOTALL)
reParenContents = re.compile(r'^([^(]*)\(([^)]+)\)(.*)', re.DOTALL)
reEntabulated = re.compile(r'([IiEe]ntabulated)', re.DOTALL)
reEncoded = re.compile(r'[Ee](ncoded)', re.DOTALL)
reEdited = re.compile(r'[Ee](dited)', re.DOTALL)
# The original passed a bare 1 as the flags argument here; that bit was the
# undocumented re.TEMPLATE flag, which has no use for these patterns.
reEnc = re.compile(r'[Ee](nc\.)')
reEd = re.compile(r'[Ee](d\.)')
reInfo = re.compile(r'^(...).*?: *(..*)$', re.DOTALL)
reKey = re.compile(r'^[A-G][b#]*[Mm]$', re.DOTALL)
reGetDictItem = re.compile(r'"([^"]*)"[^"]*"([^"]*)"')
# Was r'(\\lang[0-9]{4)}': the misplaced ')' turned the {4} quantifier into
# literal text, so \langNNNN codes were never matched.
reDeLang = re.compile(r'(\\lang[0-9]{4})')

# These are for error messages
# for current func name, specify n = 0 or no argument.
# for name of caller of current func, specify 1.
# for name of caller of caller of current func, specify 2. etc.
def funcName(n=1):
    """Return the name of the function n frames above the caller of this helper."""
    return sys._getframe(n + 1).f_code.co_name


def callerName(n=2):
    """Same as funcName, but defaults to one frame further up the stack."""
    return sys._getframe(n + 1).f_code.co_name


def lineNo(n=1):
    """Return the current line number of the frame n levels above the caller."""
    return sys._getframe(n + 1).f_lineno


# insert a string into another string at a specific location
def insert_str(stInsert, str, index):
    """Return str with stInsert spliced in at position index."""
    # The original referenced the undefined name strInsert (NameError).
    return str[:index] + stInsert + str[index:]


def at_eof(f):
    """Return True when f.tell() equals the size reported by os.fstat for f."""
    return f.tell() == os.fstat(f.fileno()).st_size


def show_tuple(t):
    """Print every item of t followed by ', ', then a newline."""
    for item in t:
        print("%s, " % item, end = "")
    print("")


difficulties = ["??", "Beginner", "Easy", "Medium", "Challenge", "Difficult", "Virtuoso"]

# (column header, attribute name) pairs
field_map = [
    ("Title", "title"),
    ("Subtitle", "subtitle"),
    ("Composer", "composer"),
    ("Orig. composer", "composer0"),
    ("Footnote", "footnote"),
    # Attribute was spelled "Source"; every reader in this file (key_order,
    # parse_footnote, parse_info, populate, make_pagedir) uses self.source.
    ("Source", "source"),
    ("Document", "document"),
    ("Volume", "volume"),
    ("Date", "date"),
    ("Page", "page"),
    ("Editor", "editor"),
    ("Encoder", "encoder"),
    ("Arranger", "arranger"),
    ("Intabulator", "intabulator"),
    ("Concordances", "concordances"),
    ("Contributor", "contributor"),
    ("Info", "info"),
    ("Piece", "piece"),
    ("Section", "section"),
    ("Type", "type"),
    ("Key", "key"),
    ("Difficulty", "difficulty"),
    ("Ensemble", "ensemble"),
    ("Part", "part"),
    ("Remarks", "remarks"),
    ("Recording", "recurl"),
    ("Facsimile", "facurl"),
    ("Fronimo", "stFile"),
    ("PDF", "stPdf"),
    ("Midi", "stMidi"),
    ("Modified", "mtime"),
    ("Created", "ctime"),
]

key_order = [  # for dumping
    "title", "subtitle", "composer", "composer0", "footnote", "source",
    "document", "volume", "date", "page", "editor", "encoder", "arranger",
    "intabulator", "contributor", "concordances", "info", "piece", "section",
    "type", "key", "difficulty", "ensemble", "part", "remarks", "recurl",
    "facurl", "stFile", "stPdf", "stMidi", "ctime", "mtime",
]


#Creates fronimo object from file with name stIn
class Fronimo:
    """In-memory view of one gzipped fronimo (.ft3) file.

    Instantiation reads and unzips the file, locates the start offset and
    populates the metadata attributes.  start_offset is set to -1 when any
    of those steps fails.
    """

    def __init__(self, stIn = TEMPLATE):
        # set all attributes to empty string
        for col, att in field_map:
            setattr(self, att, "")
        self.base = LOCALBASE
        self.currProg = ''
        self.credits = ''
        self.begText = ''
        self.endText = ''
        self.performance = False
        self.simple = False
        self.ornamented = False
        self.footnote = ""
        #input file name set from argument stIn
        self.stFile = stIn
        self.stFron = ''
        self.flFron = None
        self.flOut = None
        self.flErr = None
        self.offset = None
        self.oldOffset = None
        self.volume = ""
        self.info = ""
        self.recurl = ""
        self.facurl = ""
        #keep track of padding of the last dir contacted that contains numerical pages
        #self.lastPageDir = ["", 0]

        # Creates self.stFron from fronimo file: self.stFile
        if not self.read_and_unzip_file():
            # marker for failure of class instantiation
            self.start_offset = -1
        else:
            self.start_offset = self.get_start_offset()
            if self.start_offset > 0:
                # Fills in other values of fronimo object.
                if self.populate() == False:
                    self.start_offset = -1
    # end of __init___

    # Open error file
    @classmethod
    def open_error(kls, errfile):
        """Open errfile for writing as the class-wide error log; return success."""
        try:
            kls.flErr = open(errfile, "w", encoding = 'latin1')
        except OSError:
            print("OSError: Cannot open error File ", errfile, " for writing.", file=sys.stderr)
            return False
        except Exception:
            # narrowed from a bare except so Ctrl-C / SystemExit still propagate
            print("Other error: Cannot open error File ", errfile, " for writing.", file=sys.stderr)
            return False
        return True
    # End open_error

    @classmethod
    def open_file(kls, fl, mode):
        """Open fl in the given mode (latin1); return the file or None on failure."""
        try:
            flName = open(fl, mode, encoding='latin1')
        except OSError:
            err = "OSError: Cannot open file %s in mode %s." % (fl, mode)
            kls.print_error(fl, err)
            return None
        except Exception:
            err = "Other error: Cannot open file %s in mode %s." % (fl, mode)
            kls.print_error(fl, err)
            return None
        return flName
    # end of openFile

    # open and read all needed fronimo-related files
    @classmethod
    def open_files(kls):
        """Load the name/instrument/type/facsimile lists and open the output lists.

        Returns False as soon as one file cannot be opened, True otherwise.
        """
        kls.nameList = []
        kls.typeList = []
        kls.instList = []
        kls.noFacList = []

        # Read in and process list of recognized names
        kls.flNames = kls.open_file(NAMESFILE, "r")
        if not kls.flNames:
            print("Cannot open NAMESFILE file.")
            return False
        for line in kls.flNames.read().split('\n'):
            if line == '':
                continue
            lsRec = line.split('\t')
            # First, get the default directory name for this proper name source
            # That's capitalized last name + capitalized first letter of first name
            # Get the first and last names from 2nd list record
            lastFirst = lsRec[1]
            val = None
            if ',' in lastFirst:
                val = re.search('^([^,][^,]+), *(..*)', lastFirst)
            if val:
                slug = val.group(1) + val.group(2)[0]
            else:
                # Unless there is no first name (or the "Last, First" form
                # cannot be split, which used to raise AttributeError)
                slug = lsRec[0]
            # Remove accents
            slug = latin1_to_ascii(slug)
            # Prepend it to the list record
            lsRec.insert(0, slug)
            # and add the record to the names list.
            kls.nameList.append(lsRec)

        # Read in instrument list
        kls.flInsts = kls.open_file(INSTSFILE, "r")
        if not kls.flInsts:
            print("Cannot open INSTSFILE file.")
            return False
        # Blank lines are deliberately kept as [''] records, as before.
        for line in kls.flInsts.read().split('\n'):
            kls.instList.append(line.split('\t'))

        # Read in types list
        kls.flTypes = kls.open_file(TYPEFILE, "r")
        if not kls.flTypes:
            print("Cannot open TYPEFILE file.")
            return False
        for line in kls.flTypes.read().split('\n'):
            kls.typeList.append(line.split('\t'))

        #Read in past references to nonexistent facsimiles
        #A time saver, but needs to be updated as facsimiles are added
        kls.flNoFacDirs = kls.open_file(NOFACDIRS, "r")
        if not kls.flNoFacDirs:
            print("Cannot open NOFACDIRS file.")
            return False
        # Read in the whole list as is
        kls.noFacList = kls.flNoFacDirs.read().split('\n')

        #Read in missing pages list
        #A time saver, but needs to be updated as facsimiles are added
        kls.flMissingPages = kls.open_file(MISSINGPAGES, "r")
        if not kls.flMissingPages:
            print("Cannot open MISSINGPAGES file.")
            return False
        # Read in the whole list as is
        kls.missingPageList = kls.flMissingPages.read().split('\n')

        # Open new-facsimile-directory list and the error lists, in the
        # original order
        outputs = (
            ("flNewFacDirs", NEWFACDIRS, "NEWFACDIRS"),
            ("flNewNames", NEWNAMES, "NEWNAMES"),
            ("flNewTypes", NEWTYPES, "NEWTYPES"),
            ("flNewInsts", NEWINSTS, "NEWINSTS"),
            ("flFacErrs", FACERRS, "FACERRS"),
        )
        for attr, path, label in outputs:
            handle = kls.open_file(path, "w")
            setattr(kls, attr, handle)
            if not handle:
                print("Cannot open %s file." % label)
                return False
        return True
    # end open_files

    @classmethod
    def print_headers(kls):
        """Write the opening '[' to kls.flJson and the column header row to kls.flTsv."""
        print("[", file=kls.flJson)
        print("Title\tSubtitle\tComposer\tOrig. composer\tSource\tDocument\tVolume\tDate\tPage\tEditor\tEncoder\tArranger\tIntabulator\tContributor\tConcordances\tPiece\tSection\tType\tKey\tDifficulty\tEnsemble\tPart\tRemarks\tRecording\tFacsimile\tFronimo\tPDF\tMidi\tCreated\tModified", file=kls.flTsv)
    # end print_headers

    # for future use
    def make_contrib_dir(self):
        """Create (if needed) the directory for this piece's contributor.

        This was declared as a classmethod but read self.contributor, so
        every call raised NameError; it is an instance method now.  The
        original also had 's - s.replace(...)' where an assignment was meant.
        """
        s = self.contributor.casefold()
        s = s.replace('.', '')
        s = s.replace(' ', '_')
        self.contribDir = join_with_fslash(CONTRIBDIR, s)
        os.makedirs(self.contribDir, exist_ok=True)

    # Get canonical name for type
    @classmethod
    def find_canonical_type(kls, typ):
        """Return the canonical name for typ from typeList, or None."""
        typ = typ.strip().lower()
        for item in kls.typeList:
            # len check: blank lines load as [''] and have no canonical column
            if item[0] == typ and len(item) > 1:
                return item[1]
        return None
    #end find_canonical_type

    # Finds all types in a hierarchy below a given canonical type
    @classmethod
    def get_all_types(kls, typ):
        """Return [typ] plus every type below it in the hierarchy, without duplicates."""
        typOut = [typ]
        for t in kls.typeList:
            if len(t) != 3:
                # only look at items that have higher types
                continue
            # if there is an "and": see if typ matches one of the alternatives
            if typ not in t[2].split('&'):
                continue
            if t[1] == typ:
                # a type listed as its own parent would recurse forever
                continue
            # The original tested "list in list-of-strings", which is never
            # true, so duplicates were always appended.  Compare per item.
            for sub in kls.get_all_types(t[1]):
                if sub not in typOut:
                    typOut.append(sub)
        return typOut

    # Starts with a comma-separated list of types, validates them,
    # gets canonical name,
    # and returns the validated list with all sub-types
    @classmethod
    def get_type_list(kls, typesIn):
        """Expand a comma-separated type string into canonical types and sub-types."""
        typesOut = []
        for typ in typesIn.split(','):
            t = kls.find_canonical_type(typ)
            if t is None:
                print("Type %s not found." % typ)
                continue
            typesOut += kls.get_all_types(t)
        return(typesOut)

    # print helpful error message
    @classmethod
    def print_error(kls, currFile, errMsg):
        """Write a located error message to the error log.

        Falls back to sys.stderr when open_error() has not been called, and
        to an empty program name when currProg was never set on the class.
        """
        out = getattr(kls, 'flErr', None) or sys.stderr
        prog = getattr(kls, 'currProg', '')
        print("In %s; caller:%s; line:%d of %s\nfile:%s; %s." % (funcName(), callerName(), lineNo(), prog, currFile, errMsg), file=out)
        out.flush()

    # correct all the files given in a json file
    @classmethod
    def correct_all(kls, f):
        """Apply every record of an open json dump f to its fronimo file.

        Returns the number of files rewritten.
        NOTE(review): depopulate() and write_file() are not defined in the
        visible part of this class -- confirm they exist elsewhere.
        """
        def near_start(text, ch):
            # ch found within the first six characters of text
            return 0 <= text.find(ch) <= 5

        # kls.stFile does not exist on the class; use the stream name instead
        jsonName = getattr(f, 'name', JSONFILE)
        count = 0
        line = f.readline()
        # Find and discard the '[' at the beginning of the file
        while not at_eof(f) and not near_start(line, '['):
            line = f.readline()
        while not at_eof(f) and not near_start(line, ']'):
            # Find and discard { at beginning of json record
            while not at_eof(f) and not near_start(line, '{'):
                line = f.readline()
            if at_eof(f):
                return(count)
            dChanges = {}
            # Read the json record up to the "}" and load dChanges
            while not at_eof(f) and not near_start(line, '}'):
                # Find the first quote
                while not at_eof(f) and not near_start(line, '"'):
                    line = f.readline()
                if at_eof(f):
                    return(count)
                line = line.replace('\\"', '\xA4')
                item = reGetDictItem.search(line)
                if item is None:
                    # Was a 3-argument print_error call (TypeError) followed
                    # by 'continue' without reading on (endless loop).
                    Fronimo.print_error(jsonName, "cannot parse json line, %s." % line)
                    line = f.readline()
                    continue
                typ = item.group(1)
                val = item.group(2).replace('\xA4', '"')
                dChanges[typ] = val
                line = f.readline()
            # Load fron file corresponding to this record
            fronFile = dChanges.get('stFile')
            if fronFile is None:
                Fronimo.print_error(jsonName, "json record has no stFile entry")
                line = f.readline()
                continue
            if not os.path.isfile(fronFile):
                msg = "File %s in dft.json does not exist" % (fronFile)
                print(msg, file=kls.flErr)
                continue
            fron = Fronimo(fronFile)
            #fron.stFron now created from this fronimo file.
            # and fronimo object fron populated with values from that file.
            # punt if corresponding file not found
            if fron.start_offset == -1:
                continue
            # Save old fronimo string.
            stOldFron = fron.stFron
            # Load changes from json record into fron, replacing old values
            fron.load_changes(dChanges)
            if not fron.depopulate():
                Fronimo.print_error(fronFile, "Cannot update fronimo file string.")
                continue
            # if no changes to make, do nothing.
            if stOldFron == fron.stFron:
                continue
            if not fron.write_file(False):
                Fronimo.print_error(fronFile, "Cannot write out fronimo file.")
                continue
            count += 1
            line = f.readline()
        return count
    #end correct_all

    #print missing facsimile pages
    def no_fac_page(self, location):
        """Log a missing facsimile page unless it is already in missingPageList."""
        global newFacs
        # No point in printing out facsimile directory location
        location = location.replace(FACHEAD, '')
        # Print it out if not already known to be missing
        if location not in self.missingPageList:
            newFacs += 1
            ft3_file = self.stFile.replace(LOCALBASE, '')
            ft3_file = ft3_file.replace('composers', 'cmps')
            ft3_file = ft3_file.replace('sources', 'srcs')
            msg = ft3_file + ' ||| ' + location
            print(msg, file=Fronimo.flFacErrs)
    #End of no_fac

    # Get starting offset for reading/writing a fronimo file
    def get_start_offset(self):
        """Return the data offset chosen from byte 4 of the file, or -1 if unknown."""
        marker = self.stFron[4]
        if marker in ('\x15', '\x14'):
            return 364
        if marker == '\x16':
            return 368
        val = hex(ord(marker))
        msg = "stFron[4] = %s (not \\x15 or \\x16), so starting offset is unknown" % (val)
        Fronimo.print_error(self.stFile, msg)
        return -1
    # end get_start_offset

    # load changes from a json record into a Fronimo class instance
    def load_changes(self, dChanges):
        """Copy every key/value of dChanges onto this instance as an attribute."""
        for k, v in dChanges.items():
            setattr(self, k, v)
    # end of load_changes

    # read and unzip a fronimo file with name self.stFile into self.stFron
    def read_and_unzip_file(self):
        """Read and decode self.stFile into self.stFron; return success.

        gzip.open raises instead of returning None, so open/read failures
        are caught here and reported; the handle is always closed.
        """
        try:
            self.flFron = gzip.open(self.stFile)
            try:
                # Read entire file into a byte string
                btFron = self.flFron.read()
            finally:
                self.flFron.close()
        except (OSError, EOFError):
            Fronimo.print_error(self.stFile, 'Cannot open and unzip fronimo file.')
            return False
        if len(btFron) < 100:
            Fronimo.print_error(self.stFile, 'Cannot read fronimo file.')
            return False
        self.stFron = btFron.decode("latin1")
        if len(self.stFron) < 100:
            Fronimo.print_error(self.stFile, 'Cannot decode fronimo file.')
            return False
        return True
    # end of read_and_unzip_file

    # get x number of chars from string. Updates offset value
    def _get(self, numChars):
        """Return the next numChars characters of stFron and advance the offset.

        Returns "" (after logging) when the read would run past the end;
        the offset has already been advanced in that case, as before.
        """
        if numChars == 0:
            return ""
        oldOffset = self.offset
        self.offset += numChars
        if self.offset > len(self.stFron):
            errMsg = "Cannot get %d chars starting at offset %d" % (numChars, oldOffset)
            Fronimo.print_error(self.stFile, errMsg)
            return ""
        return self.stFron[oldOffset:self.offset]

    # Get two bytes of info
    def _getWord(self, fSigned):
        """Read a little-endian 16-bit word; return -1 when bytes are missing."""
        inCh = self._get(1)
        if inCh == "":
            errMsg = "No first byte in stFron at offset %d" % (self.offset)
            Fronimo.print_error(self.stFile, errMsg)
            return -1
        word1 = ord(inCh)
        inCh = self._get(1)
        if inCh == "":
            errMsg = "No 2nd byte in stFron at offset %d" % (self.offset)
            Fronimo.print_error(self.stFile, errMsg)
            return -1
        word = word1 + 256 * ord(inCh)
        # >= : 0x8000 is -32768 in two's complement (was '>')
        if fSigned and word >= 32768:
            word -= 65536
        return word

    # Get a fronimo-formatted string
    def _getBstr(self):
        """Read one length-prefixed string at the current offset ("" on error)."""
        # first byte is string length if < 255
        firstByte = self._get(1)
        # was 'firstByte == False', which is never true for "" and let
        # ord("") raise TypeError
        if firstByte == "":
            errMsg = "No first byte in stFron at offset %d" % (self.offset)
            Fronimo.print_error(self.stFile, errMsg)
            return ""
        length = ord(firstByte)
        if length == 0:
            return ""
        # First char 255 means a long string.
        # Next 2 chars determine string length as an unsigned integer
        if length == 255:
            length = self._getWord(False)
            if length == -1:
                errMsg = "Zero string length from _getWord, offset %d" % (self.offset)
                Fronimo.print_error(self.stFile, errMsg)
                return ""
        stOut = self._get(length)
        if stOut == "":
            # the quotes were lost to adjacent-literal concatenation before
            errMsg = 'Result of get(length) is "", at offset %d' % (self.offset)
            Fronimo.print_error(self.stFile, errMsg)
            return ""
        return stOut

    def _put(self, stNew):
        """Replace the length-prefixed string at the current offset with stNew.

        Assumes stNew is RTF'd, if necessary, but not yet in fronimo string
        format (no leading length indicator).  Returns False when the old
        string's length bytes cannot be read.
        """
        # Find length of old fronimo string
        ch = self._get(1)
        if ch == "":
            errMsg = "No length byte in stFron at offset %d" % (self.offset)
            Fronimo.print_error(self.stFile, errMsg)
            return False
        if ch == '\xFF':
            # means next 2 chars determine length
            byte1 = self._get(1)
            if byte1 == "":
                errMsg = "No first byte in stFron at offset %d" % (self.offset)
                Fronimo.print_error(self.stFile, errMsg)
                return False
            byte2 = self._get(1)
            if byte2 == "":
                errMsg = "No second byte in stFron at offset %d" % (self.offset)
                Fronimo.print_error(self.stFile, errMsg)
                return False
            # second byte is higher order
            # Length of actual string + 3 bytes to specify the length
            length = ord(byte1) + 256 * ord(byte2) + 3
            # reset offset to compensate for 3 _get(1)'s
            self.offset -= 3
        else:
            # Length of actual string + 1 byte to specify the length
            length = ord(ch) + 1
            # reset offset to compensate for 1 _get(1)
            self.offset -= 1
        # find length of new string
        newLen = len(stNew)
        if newLen < 255:
            stInsert = chr(newLen)
            # We will add one length indicator to the head of the string
            newLen += 1
        else:
            stInsert = chr(255) + chr(newLen % 256) + chr(newLen // 256)
            # We will add three length indicators to the head of the string
            newLen += 3
        # Add leading length indicator and splice in the new string
        stNew = stInsert + stNew
        self.stFron = self.stFron[:self.offset] + stNew + self.stFron[self.offset + length:]
        # set new offset
        self.offset += newLen
        return True
    # end _put

    def latin2rtf(self, stIn):
        """Wrap stIn in the RTF prefix/suffix, escaping chars >= 127 and '|'."""
        pieces = []
        for ch in stIn:
            if ord(ch) >= 127:
                # hex value of latin1 char --> last 2 chars of rtf code
                pieces.append("\\'" + hex(ord(ch)).replace('0x', ''))
            elif ch == '|':
                pieces.append('\\par\r\n')
            else:
                pieces.append(ch)
        return RTFPREFIX + "".join(pieces) + RTFSUFFIX

    def replace_rtf_codes(self, stIn):
        r"""Turn RTF escapes in stIn (\'xx, \par, \cf0/1, \langN) into plain text."""
        # Decode each \'xx where it stands.  The original replaced the first
        # occurrence of the two hex digits anywhere in the string, which
        # could corrupt earlier text, and raised on non-hex digits.
        stIn = re.sub(r"\\'([0-9A-Fa-f]{2})",
                      lambda m: chr(int(m.group(1), 16)), stIn)
        stIn = stIn.replace('\\par', '|')
        stIn = stIn.replace('\\cf1', '')
        stIn = stIn.replace('\\cf0', '')
        stIn = re.sub(r'\\lang[0-9]*', '', stIn)
        # NOTE(review): as written this replace is a no-op; presumably it
        # once collapsed double spaces -- confirm against the original file.
        stIn = stIn.replace(' ', ' ')
        # KLUDGE to handle unicode weirdness:
        # lop off final character from each unicode sequence
        # (the original asked for group(2) of a one-group pattern: IndexError)
        stIn = reUnicode.sub(lambda m: m.group(1)[:-1], stIn)
        return(stIn)

    def rtf2latin(self, stRtf):
        """Extract the text of an RTF-wrapped string; other strings pass through."""
        if stRtf.find('{\\rtf', 0) != 0:
            return stRtf
        if len(stRtf) < 100:
            return stRtf
        start = stRtf.find('\\f0\\fs', 0)
        if start == -1:
            return stRtf
        # skip the 6 chars of '\f0\fs' plus 3 more (e.g. '-22' in RTFPREFIX)
        start += 9
        end = stRtf.find('\\par\r\n}\r\n', start + 1)
        if end == -1:
            return stRtf
        stOut = stRtf[start:end]
        stOut = stOut.replace('\\par\r\n', '\n')
        stOut = stOut.replace('\\{', '{')
        stOut = stOut.replace('\\}', '}')
        # KLUDGE to get rid of \langxxxx peculiarity that sometimes shows up
        # Might want to reinstate this if we can figure out what it means
        val = reDeLang.search(stOut)
        if val:
            stOut = stOut.replace(val.group(1), "")
        stOut = stOut.strip(' \t')
        return self.replace_rtf_codes(stOut)

    def get_composer0(self):
        """Mine the subtitle's first parenthesised text for the original composer.

        Returns '' without parentheses; otherwise the text after ' - ' if
        present, else the whole parenthesised text.
        NOTE(review): branch nesting was reconstructed from collapsed source
        (the other reading raises on subtitles without parentheses).
        """
        if self.subtitle == '':
            return ''
        #Leave subtitle intact but mine for composer0
        pcont = reParenContents.search(self.subtitle)
        if not pcont:
            return ''
        inParen = pcont.group(2)
        hyphenIndex = inParen.find(' - ')
        if hyphenIndex != -1:
            inParen = inParen[hyphenIndex + 3:]
        return inParen

    def parse_document(self, doc):
        """Split the footnote's document part into document, volume, date and page."""
        if doc == "":
            Fronimo.print_error(self.stFile, "No document")
            return False
        val = reGetDate.search(doc)
        if val:
            # We have a date
            self.date = val.group(1).strip()
            val = reApproxDate.search(self.date)
            if val:
                caMark = val.group(1)
                if caMark != "":
                    # normalise 'c', 'ca', 'ca.' ... to 'c.'
                    self.date = self.date.replace(caMark, "")
                    self.date = "c." + self.date
            val = re.search(r'\(>?c?a?\.? ?[0-9]{4}\), *([fp#%]{1,2}\.* *[^.]*)\.?$', doc, re.DOTALL)
            if val:
                self.page = val.group(1).replace(" ", "")
            else:
                # this is not really an error, per se.
                self.page = ""
        else:
            self.date = ""
            Fronimo.print_error(self.stFile, "Cannot get date from %s" % doc)
            # Look for a page anyway (unlikely)
            val = re.search(r', *([fp#%]{1,2}\.* *.*)\.$', doc, re.DOTALL)
            if val:
                self.page = val.group(1)
            else:
                self.page = ""
                Fronimo.print_error(self.stFile, "No page # in %s" % doc)
        # Get document without date and page.  '>?' added so that dates the
        # date regex accepts, such as '(>1600)', are stripped here as well.
        val = re.search(r' ?\(>?c?a?\.? ?[12][0-9]{3}\)', doc)
        if val:
            self.document = doc[:val.start()]
        else:
            self.document = doc
        self.document = self.document.strip()
        val = re.search("([^,][^,]*), *v[. ] *([^,]+)", self.document)
        if val is None:
            self.volume = '0'
        else:
            self.document = val.group(1)
            self.volume = val.group(2)
        return True
    #end of parse footnote

    def slugify(self, stIn):
        """Return stIn de-accented, with colons, apostrophes and spaces as '_'."""
        # stIn = stIn.lower()
        stIn = latin1_to_ascii(stIn)
        stIn = re.sub(': *', '_', stIn)
        stIn = stIn.replace("'", '_')
        stIn = re.sub(' +', '_', stIn)
        return(stIn)

    # An earlier, shadowed definition of make_pagedir stood here.  It was
    # never reachable (the later definition of the same name replaces it)
    # and used an undefined local 'document', so it has been dropped.

    def parse_credits(self, cred):
        """Set self.editor / self.encoder from a credits string; return success."""
        # expand abbreviations [Ee]d. [Ee]nc, and &
        cred = cred.replace(' & ', ' and ')
        cred = re.sub(r'\.$', '', cred)
        # Expand at the matched position; the original replaced the first
        # 'd.' / 'nc.' anywhere, which could hit an unrelated earlier one.
        val = reEd.search(cred)
        if val:
            cred = cred[:val.start(1)] + 'dited' + cred[val.end(1):]
        val = reEnc.search(cred)
        if val:
            cred = cred[:val.start(1)] + 'ncoded' + cred[val.end(1):]
        # Change [IiEe]ntabulated to Encoded
        val = reEntabulated.search(cred)
        if val:
            cred = cred.replace(val.group(1), "Encoded")
        #Handle "by", "and", and ";" in credits string
        val = re.search('[Ee](?:dited|ncoded) and [Ee](?:dited|ncoded) by (..*)$', cred, re.DOTALL)
        if val:
            # one person did both jobs
            self.encoder = val.group(1).replace('S.Gerbode', 'Sarge Gerbode')
            self.editor = self.encoder
            return True
        val = re.search('(E(?:ncoded|dited)) by (..+) ?(?:[;.]|and) ([Ee](?:dited|ncoded)) by (..*)$', cred, re.DOTALL)
        if val:
            type1 = val.group(1)
            cred1 = val.group(2).strip()
            cred2 = val.group(4).strip()
            if type1 == "Edited":
                self.editor, self.encoder = cred1, cred2
            else:
                self.encoder, self.editor = cred1, cred2
            self.editor = self.editor.replace('S.Gerbode', 'Sarge Gerbode')
            self.encoder = self.encoder.replace('S.Gerbode', 'Sarge Gerbode')
            return True
        self.editor = self.encoder = ""
        return False
    # End parse_credits

    # sets source, document, volume, date, page, encoder, editor
    def parse_footnote(self):
        """Parse self.footnote into its component fields; return success.

        The fields are reset first so they always exist, and an empty
        footnote now returns False (it returned None and left them unset).
        """
        self.source = self.document = self.volume = self.date = self.page = self.encoder = self.editor = ""
        if self.footnote == "":
            return False
        # NOTE(review): the separator reads ' +' in the collapsed source,
        # which would split on every space; presumably it was a wider gap
        # (e.g. two or more spaces) -- confirm against the original file.
        lsParts = re.split(' +', self.footnote)
        numParts = len(lsParts)
        if numParts < 2 or numParts > 3:
            stErr = "Footnote \"%s\" has wrong # of parts (%d)" % (self.footnote, numParts)
            Fronimo.print_error(self.stFile, stErr)
            return False
        if numParts == 2:
            # source == ''; later, source will = composer
            doc, cred = lsParts
        else:
            self.source, doc, cred = lsParts
        if not self.parse_document(doc):
            Fronimo.print_error(self.stFile, "Cannot parse document:%s" % doc)
            return False
        # unparsable credits are logged but do not fail the footnote
        if not self.parse_credits(cred):
            Fronimo.print_error(self.stFile, "Cannot parse credits: %s." % cred)
        return True
    # End parse_footnote

    def get_diff_val(self, difficulty):
        """Map a difficulty given as a digit 0-6 or a word to its number (0 if unknown)."""
        words = {
            'beg': 1, 'eas': 2, 'sim': 2, 'med': 3, 'cha': 4,
            'dif': 5, 'har': 5, 'vir': 6, 'kil': 6,
        }
        diff = difficulty.strip()
        if diff:
            # slice the stripped text (the original sliced the raw argument,
            # so leading blanks defeated the lookup)
            diff = diff[:3].lower()
        else:
            diff = '3'  # Medium is default difficulty value
        if diff in ('0', '1', '2', '3', '4', '5', '6'):
            return int(diff)
        if diff in words:
            return words[diff]
        stErr = "Difficulty value \"%s\" not meaningful" % (difficulty)
        Fronimo.print_error(self.stFile, stErr)
        return 0

    def parse_info(self):
        """Read 'field: value' lines from self.info and validate parts/instruments.

        Everything after a line starting with '--' is collected into
        self.remarks, joined by '|'.  Always returns True.
        """
        global instCount
        global partCount
        # first three letters of the field name -> attribute to set
        attrFor = {
            "tra": "arranger", "rea": "arranger", "arr": "arranger",
            "lib": "source", "sou": "source", "pub": "source",
            "ins": "ensemble", "ens": "ensemble",
            "doc": "document",
            "ori": "composer0", "co0": "composer0",
            "tit": "title", "sub": "subtitle", "com": "composer",
            "pag": "page", "edi": "editor", "enc": "encoder",
            "int": "intabulator", "con": "concordances", "pie": "piece",
            "fac": "facurl", "rec": "recurl", "sec": "section",
            "typ": "type", "key": "key", "par": "part",
        }
        isRemark = False
        remarks = ""
        for datum in re.split('\n', self.info):
            datum = datum.strip()
            # Ignore blank lines
            if datum == "":
                continue
            if isRemark:
                remarks = remarks + '|' + datum if remarks else datum
                continue
            isRemark = (datum.find('--') == 0)
            if isRemark:
                # everything after line starting with '--' is a remark
                continue
            if datum.find(':') == -1:
                continue
            # so it is a field; decode it
            reItem = reInfo.search(datum)
            if not reItem:
                continue
            field = reItem.group(1).lower()
            value = reItem.group(2).strip()
            if field == "dif":
                self.difficulty = self.get_diff_val(value)
            elif field in attrFor:
                setattr(self, attrFor[field], value)
            else:
                # the literal used to be split by a stray quote/backslash
                stErr = "Info field \"%s\" not found" % (field)
                Fronimo.print_error(self.stFile, stErr)
        self.remarks = remarks

        # checking to see if all parts list items are in the ensemble list.
        # first collect all items in the ensemble, including tags
        insList = []
        ensItems = []
        for ens in self.ensemble.split(","):
            ens = ens.strip().lower()
            if ens.find(":") > 0:
                instag = ens.split(":")
                # Include tags for part check but not for instrument check
                ensItems.append(instag[0].strip())
                ensItems.append(instag[1].strip())
                insList.append(instag[1].strip())
            else:
                insList.append(ens)
                ensItems.append(ens)
        # Then check part list item against them
        if self.part != "":
            for ins in self.part.split(","):
                ins = ins.strip().lower()
                if ins == "score":
                    continue
                if ins not in ensItems:
                    stErr = "Part \"%s\" not in ensemble list" % (ins)
                    partCount += 1
                    Fronimo.print_error(self.stFile, stErr)
        # NOTE(review): placed outside the 'part' check when reconstructing
        # the collapsed source -- confirm.
        for ins in insList:
            ins = ins.strip().lower()
            if not any(item[0] == ins for item in Fronimo.instList):
                instCount += 1
                stErr = "Inst. \"%s\" not found in \"%s\"." % (ins, self.stFile)
                print(stErr, file=Fronimo.flNewInsts)
                Fronimo.flNewInsts.flush()
        return True
    # end of parse_info

    def write_new_name(self, newName):
        """Record newName (with this file's name) in the new-names list."""
        global nameCount
        nameCount += 1
        stOut = "%s --> %s" % (newName, self.stFile)
        print(stOut, file=Fronimo.flNewNames)
        Fronimo.flNewNames.flush()

    def check_name(self, name):
        """Return True when every name in 'A and/or B' is in nameList.

        Unknown names are written to the new-names list.  All parts are
        checked now; the original stopped at the first unknown one.
        An empty name returns False.
        """
        name = name.replace('?', '')
        name = name.replace('\n', '')
        val = reSplitAndOr.search(name)
        if val:
            lsName = [val.group(1), val.group(2)]
        elif name:
            lsName = [name]
        else:
            return False
        allFound = True
        for nm in lsName:
            if not any(item[1] == nm for item in Fronimo.nameList):
                # write out list of names not found in names list
                self.write_new_name(nm)
                allFound = False
        return allFound
    # end of check_name

    def write_new_types(self):
        """Record every type of self.type that is missing from typeList."""
        global typeCount
        # get values from comma-separated list
        for typ in self.type.split(","):
            typ = typ.strip().lower().replace("?", "")
            if any(item[0] == typ for item in Fronimo.typeList):
                continue
            typeCount += 1
            stOut = "%s --> %s" % (typ, self.stFile)
            print(stOut, file=Fronimo.flNewTypes)
            Fronimo.flNewTypes.flush()
    # end of write_new_types

    # Get 0 padding for pages in the directory.
    # returns [pad, suffix]
    def get_pad(self, pageDir):
        """Return [pad, suffix] for the image files in pageDir.

        pad is 1, 2 or 3 according to the largest page number found (0 if
        none); suffix is the extension of the last image file seen.
        """
        suffix = ''
        lastPage = 0
        #get a list of page file names
        for page in os.listdir(pageDir):
            suf = page[-4:].lower()
            if suf not in IMAGETYPES:
                continue
            suffix = suf
            page = page[:-4]
            # strip position on page designation
            # ('page and' guards names that are empty after trimming, which
            # used to raise IndexError)
            if page and page[-1] in 'abcdefghijklm':
                page = page[:-1]
            # Characters to correct for mispagination in orig.
            # Like if there are 2 page 8's, second one could be 8s, third one 8t, etc.
            if page and page[-1] in 'xyz':
                page = page[:-1]
            if page.isdigit():
                lastPage = max(lastPage, int(page))
        # file numbers are 0 padded
        if lastPage >= 100:
            pad = 3
        elif lastPage >= 10:
            pad = 2
        elif lastPage >= 1:
            pad = 1
        else:
            pad = 0
        return [pad, suffix]
    #end get_pad

    # Convert self.page into a valid directory entry
    def get_page_entry(self, pad):
        """Return the file-name stem for self.page, or '' if it cannot have one."""
        # blank page or page containing % or # guarantees no facsimile file
        if self.page == '' or re.search('[#%]', self.page) is not None:
            return ''
        val = re.search(r'^[fp][fp]*\. *([^,.][^,.]*)', self.page)
        if val is None:
            errmsg = "cannot parse page # %s" % (self.page)
            Fronimo.print_error(self.stFile, errmsg)
            return ''
        page = val.group(1)
        # The len() guards keep a one-character page such as 'a' intact;
        # the original stripped it to '' and then raised IndexError.
        # strip position on page designation
        if len(page) > 1 and page[-1] in 'abcdefghijklm':
            lastChar = page[-1]
            page = page[:-1]
        else:
            lastChar = ''
        if len(page) > 1 and page[-1] == 'v':
            verso = 'v'
            page = page[:-1]
        else:
            verso = ''
        #To handle misnumberings, may have to add x, y, or z to page numbers.
        if len(page) > 1 and page[-1] in 'xyz':
            extra = page[-1]
            page = page[:-1]
        else:
            extra = ''
        # Special case for leading pages like 00a, 000b, etc.
        # or like a2, l4v, etc.
        if not page.isdigit():
            return page + extra + verso
        if pad > 1:
            page = page.zfill(pad)
        #Special case for pages like 000a, 000b, etc.
        if int(page) == 0:
            page = page + lastChar
        return(page + extra + verso)
    # end get_page_entry

    def make_pagedir(self):
        """Build the facsimile page directory for this piece ('' if unknowable).

        document and volume come from parse_footnote; self.source is set in
        populate when the footnote did not provide it.
        """
        # Handling the source field
        src = self.source
        if re.match('[A-Z]+-[A-Z]+[a-z]*', src):
            # Get rid of apostrophes, colons, and spaces in library name
            src = self.slugify(src)
        else:
            # It's a proper name, a composer, intabulator, publisher, anthologist, etc.
            # First check if it is in names list
            found = False
            for item in self.nameList:
                if item[1] == src:
                    #slugify it; handle spaces in the last name
                    src = item[0].replace(' ', '_')
                    found = True
                    break
            if not found:
                # it's not going to have a facurl if src is a compound name
                # but make sure we pick up any stray names anyway
                self.check_name(src)
                return('')
        # get rid of accents and replace spaces and colons with underlines
        document = self.slugify(self.document)
        facurl = FACHEAD + src + '/' + document
        # 2 different handlings if there is a volume
        # NOTE(review): parse_document stores '0' (truthy) when no volume is
        # found, so this branch is also taken for volume-less documents --
        # confirm whether '0' should count as "no volume".
        if self.volume:
            # make entry for volume directories, appending date
            # assumes a document directory doesn't have appended date when there are volumes.
            stVol = "v." + self.volume + '_' + self.date
            pageDir = facurl + '/' + stVol
        else:
            # assumes document directory has appended date if no volumes
            pageDir = facurl + '_' + self.date
        return pageDir
    # end of make_pagedir

    #Starts with an uncompressed fronimo string (stFron) and reads values into
    # an instance of a fronimo class object.
    def populate(self):
        """Fill footnote, title, subtitle, composer and info fields from stFron.

        Returns False only when the 'CPiece' marker is missing; an
        unparsable footnote is logged and parsing continues.
        """
        self.offset = self.start_offset
        self._getBstr()  # Skip page number string
        self.footnote = self._getBstr().strip()
        if not self.parse_footnote():
            Fronimo.print_error(self.stFile, "Cannot parse footnote:\n%s" % self.footnote)
        newPos = self.stFron.find('CPiece')
        # This is very unlikely to happen
        if newPos == -1:
            Fronimo.print_error(self.stFile, "Cannot find pattern 'CPiece'")
            return False
        self.offset = newPos + 14
        self.title = self._getBstr().strip(' \t')
        self.title = self.rtf2latin(self.title)
        # NOTE(review): strip('[\t ]') also removes leading/trailing square
        # brackets; looks like a regex character class was intended -- confirm.
        self.subtitle = self._getBstr().strip('[\t ]')
        self.subtitle = self.rtf2latin(self.subtitle)
        self.subtitle = self.subtitle.replace('\n', '|')
        self.composer0 = self.get_composer0()
        self.composer = self._getBstr().strip('[\t ]')
        self.composer = self.rtf2latin(self.composer)
        if self.source == "":
            # no source in the footnote: fall back to the composer
            self.source = self.composer.replace('?', '')
            if self.source == 'Anonymous':
                self.source = 'Unknown'
        self._getBstr()  # Discard text at beginning and end of section
        self._getBstr()
        self.info = self._getBstr().strip()
        self.info = self.info.replace('\r', '')
        # Skip key field; will get key from info field
        # offset = stFron.find('CBAR',offset) + 10
        # keyNum = self._getWord(True)
        self.composer = self.composer.replace('Anon.', 'Anonymous')
        self.composer0 = self.composer0.replace('Anon.', 'Anonymous')
        return True
    # End Populate


def main():
    flBadSigs = open('badsigs.txt', "w", encoding = 'latin1')
    if not flBadSigs:
        print("Cannot open bad signatures file.")
        return False
    count = 0
    oldDir = ""
    lsAll = []
    # Walk the main directory tree
    for root, dirs, files in os.walk(CURRDIR):
        # Only when changing root directories do we collect a list of .ft3 files
        root = root.replace("\\", '/')
        if root == oldDir:
            continue
        # Now we have a new "old directory"
        oldDir = root
        # Won't find any .ft3 files in directory trees starting with BADDIRS
        # so
don't process these for target in BADDIRS: result = root.find("/" + target) if result != -1: break if result != -1: continue # Get a list of .ft3 files in this directory ft3s = [f for f in files if f.endswith('.ft3')] # Process each .ft3 file in the list. for fname in ft3s: # collects info from fronimo file, formats it, and outputs it. # open and unzip fronimo file fron = Fronimo(root + '/' + fname) if fron.start_offset <= 0: # means file not processed correctly print(fron.stFile + ' ' + 'damaged.') continue signature = [fron.source,fron.document,fron.volume,fron.page] other = [fron.composer.rstrip(), fron.composer0.rstrip(), fron.title.rstrip()] lsValues = [signature, other, fron.stFile] lsAll.append(lsValues) # end for fname in ft3s # end if root != oldDir # After walking the directories. lsAll.sort() for i in range(len(lsAll)): if i > 0: # if 2 consecutive signatures match if lsAll[i][0] == lsAll[i-1][0]: # And other data doesn't match # Allowing for added numberings currTitle = lsAll[i][1][2] lastTitle = lsAll[i-1][1][2] currTitle = re.sub("^[0-9 .]+", "", currTitle) currTitle = re.sub("[0-9 ]+$", "", currTitle) lastTitle = re.sub("^[0-9 .]+", "", lastTitle) lastTitle = re.sub("[0-9 ]+$", "", lastTitle) currTitle = currTitle.lower() lastTitle = lastTitle.lower() # Substitute in titles without numbers lsAll[i][1][2] = currTitle lsAll[i-1][1][2] = lastTitle # Now check if equal if lsAll[i][1] != lsAll[i-1][1]: location = f"{lsAll[i][0][0]}|{lsAll[i][0][1]}|{lsAll[i][0][2]}|{lsAll[i][0][3]}|" print (location, file=flBadSigs) print('in', lsAll[i-1][2], 'and', file=flBadSigs) print(f"{lsAll[i][2]}", file=flBadSigs) print(f"{lsAll[i-1][1]}!=", file=flBadSigs) print(f"{lsAll[i][1]}", file=flBadSigs) print('--------------', file=flBadSigs) count += 1 print(str(count) + ' ' + 'bad/duplicate signatures.', file=flBadSigs) print(str(count) + ' ' + 'bad/duplicate signatures.', file=sys.stderr) # end main if __name__ == "__main__": main()