#!/usr/bin/python3.4
from pprint import pprint, pformat
from difflib import SequenceMatcher
import os
import csv
import re
import sys
#import distance
import collections

from unicode import latin1_to_ascii
from strings import IncrementString

MAXINT = 2**31  # KLUDGE
DIR_DATA = "."  # KLUDGE

"""
Definitions:

A Piece() is an abstract concept that encompasses all concrete realizations
(Settings()) of a particular piece of music. E.g. Greensleeves is an original
tune that has been arranged in many different ways and in many different keys.
These individual arrangements are called Settings() of the Piece() called
Greensleeves.

A Part() represents a single .ft3 file and contains information extracted from
it by Sarge's dft.pl program. It may either be for a single instrument or
voice, or for several, or even be the entire score of a Setting(), but it is
always derived from a single file.

A Section() is composed of one or more Parts() for an identified sub-portion
of a Setting(). A Section() is analogous to a movement in a symphony or other
large work. Parts() are identified as being associated with a Section() using
meta data in the file.

A Setting() is composed of either (but not both):
    1. one or more Parts(), or
    2. one or more Sections().
If there are Sections *and* Parts, then this is an editorial error that is
flagged for correction.

The dft.tsv file represents all information extracted (by dft.pl) from all
.ft3 files in the collection. A certain amount of normalization has already
been accomplished by dft.pl.

The routine Setting.create_from_csv() consumes the dft.tsv file and generates
a Part() for each row. A Part's signature is composed of the tuple
(Source, Document, Page, Date) and is used to group Parts() into Settings().
I.e all Parts() with the same signature are *by definition* in the same
Setting().

Once all Parts() have been created and grouped, each Setting() has its
setting.analyze_sections() routine invoked to discover if there are Sections()
that can be inferred from information common amongst various Parts() in the
Setting().

Eventually there will be the assignment of Settings() to Pieces(), but the
exact nature of discovering Setting() >> Piece() is still in flux. More than
likely it will be accomplished via one or more META files.

Invariants:
    1. All parts in a setting have the title
    2. All part instrument lists must be in same order as ensemble field

In the file name:
    _P = Performance
    _O = Ornamented

score is in part.part, should be sole value and will not show up in ensemble

Composer detail
    need exception sort for some composers
    type title

Instruments
    14 course and also handle smaller number of courses
    if lute pitch is specified, smaller courses must have same pitch
    Unspecified pitch == G for lutes and archlutes
    Unspecified pitch == A for theorbo
"""


class bcolors:
    """ANSI escape sequences for coloured terminal output."""
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'


def gen_csv(csv_file, delimiter="\t", use_dict=False, clean_empty_end=False):
    """Yield the rows of a delimited ISO-8859-1 text file.

    csv_file        -- path of the file to read
    delimiter       -- field separator (tab by default)
    use_dict        -- yield dicts (csv.DictReader) instead of lists
    clean_empty_end -- drop empty trailing fields from list rows
    """
    with open(csv_file, encoding="ISO-8859-1") as csvfile:
        if use_dict:
            reader = csv.DictReader(csvfile, delimiter=delimiter)
        else:
            reader = csv.reader(csvfile, delimiter=delimiter)
        for line in reader:
            # Eat empty trailing fields.  Only list rows can be trimmed, and
            # a row made up solely of empty fields must not pop() past empty.
            if clean_empty_end and isinstance(line, list):
                while line and line[-1] == '':
                    line.pop()
            yield line
    # with
# gen_csv


def ForceUtf8(strIn):
    """Return strIn encoded as UTF-8 bytes.

    Byte strings are first decoded as latin1; values without a decode()
    method (Python 3 str) are encoded directly.
    """
    strOut = strIn
    try:
        strOut = strOut.decode('latin1')
    except (AttributeError, UnicodeError):
        # Already text (no .decode) or not decodable: encode what we have.
        pass
    strOut = strOut.encode(encoding="UTF-8")
    return strOut
# ForceUtf8


reSep = re.compile(r'[.,]\s*')
reSpaces = re.compile(r"\s\s+")
reNum = re.compile(r"\s*\d+$")
reOther = re.compile(r"\s*\|\s*(duet|ground|treble)")

"""
def GetInstruments():
    # Prepopulate the Instruments dictionary.
    for i in Instrument.objects.all():
        mpNameInst[i.name] = i
# GetInstruments

GetInstruments()
"""


def str_homogenize(str_in):
    """Lower-case str_in, turn hyphens and whitespace runs into single
    spaces, and pass the result through latin1_to_ascii()."""
    tmp = str_in.lower()                # force to lower
    tmp = re.sub(r'-', ' ', tmp)        # hyphens become spaces
    tmp = re.sub(r'\s+', ' ', tmp)      # all white space goes to one space
    tmp = latin1_to_ascii(tmp)          # replace any accent characters
    return tmp
# str_homogenize


def str_similarity(a, b, typo_min):
    """Classify how two similar strings differ.

    When the SequenceMatcher ratio exceeds typo_min, progressively stronger
    normalizations are applied and the label of the first one that makes the
    strings equal is returned (a short string such as "CAPS ").  Otherwise
    the numeric ratio itself is returned.
    """
    ratio = SequenceMatcher(None, a, b).ratio()
    if ratio > typo_min:
        # might be a typo
        a_tmp = a.lower()
        b_tmp = b.lower()
        if a_tmp == b_tmp:
            return "CAPS "

        a_tmp = re.sub(r'^\s+', '', a_tmp)
        b_tmp = re.sub(r'^\s+', '', b_tmp)
        if a_tmp == b_tmp:
            return "LEAD SPC"

        a_tmp = re.sub(r'\s+', ' ', a_tmp)
        b_tmp = re.sub(r'\s+', ' ', b_tmp)
        if a_tmp == b_tmp:
            return "INT SPC"

        hyph = r'-+\s+'
        a_tmp = re.sub(hyph, r'', a_tmp)
        b_tmp = re.sub(hyph, r'', b_tmp)
        if a_tmp == b_tmp:
            return "HYPHEN "

        punct = r'["\'<>.,;:!?$%#@()&^\[\]{}-]+'
        a_tmp = re.sub(punct, r'', a_tmp)
        b_tmp = re.sub(punct, r'', b_tmp)
        if a_tmp == b_tmp:
            return "PUNCT "

        a_tmp = latin1_to_ascii(a_tmp)
        b_tmp = latin1_to_ascii(b_tmp)
        if a_tmp == b_tmp:
            return "ACCENTS"

        a_tmp = re.sub(r'^\s*([0-9]+[a-z]?\.?\s+)?', r'', a_tmp)
        b_tmp = re.sub(r'^\s*([0-9]+[a-z]?\.?\s+)?', r'', b_tmp)
        if a_tmp == b_tmp:
            return "LEAD NM"

        a_tmp = re.sub(r'([0-9]+[a-z]?\.?\s+)', r'', a_tmp)
        b_tmp = re.sub(r'([0-9]+[a-z]?\.?\s+)', r'', b_tmp)
        if a_tmp == b_tmp:
            return "INT NUM"

        if len(a) != len(b):
            return "LENGTH "
    return ratio
# str_similarity


def Indent(level=0):
    """Return the indentation prefix for the given nesting level."""
    return " " * level


def EnsembleEncode(ensemble):
    """Placeholder: splits the ensemble string but currently returns None."""
    insts = ensemble.split(",")
    return None


def CmpEnsemble(a, b):
    """Placeholder comparison; currently just returns its first argument."""
    return a


def is_iterable(obj):
    """Return True when iter(obj) succeeds."""
    try:
        iter(obj)
        return True
    except TypeError:
        return False
# is_iterable


class LuteMixIn(object):
    """Shared behaviour for the in-memory "model" classes.

    Subclasses that use get()/add()/all() supply a class-level _OBJECTS dict
    which acts as their registry.
    """

    def __init__(self, *args, **kwargs):
        self._errors = []   # to accumulate discovered errors for editorial views

    def save(self):
        """No-op persistence hook."""
        pass

    @classmethod
    def get(kls, key, fail=True):
        """Return the registered object for key.

        With fail=True a missing key raises KeyError; with fail=False it
        returns None.
        """
        if key in kls._OBJECTS or fail:
            return kls._OBJECTS[key]    # will fail if no such key
        return None

    @classmethod
    def add(kls, key, obj):
        """Register obj under key (silently overwriting any previous entry)."""
        # what about overwrites?
        kls._OBJECTS[key] = obj

    @classmethod
    def get_or_create(kls, key, *args, **kwargs):
        """Return (obj, created) for key, constructing kls(key, ...) if needed.

        The 'save' keyword controls whether a newly created object is
        registered; 'commit' is accepted for compatibility and has no effect.
        Both are removed from kwargs before the constructor is called.
        """
        created = False
        obj = kls.get(key, fail=False)
        if not obj:
            save = kwargs.pop('save', False)
            # The original assigned 'commit' to 'save' by mistake.
            commit = kwargs.pop('commit', False)
            obj = kls(key, *args, **kwargs)
            created = True
            if save:
                #obj.save()
                kls.add(key, obj)
            if commit:
                #transaction.commit()
                pass
        return (obj, created)

    @classmethod
    def all(kls, sorted=False):
        """Return the registry dict, or a key-sorted list of (key, obj)."""
        if sorted:
            ret = [(k, v) for k, v in kls._OBJECTS.items()]
            ret.sort(key=lambda a: a[0])
            return ret
        else:
            return kls._OBJECTS

    def map_fields(self, fieldsIn):
        """Placeholder; not implemented."""
        pass

    def dump_fields(self, indent=0, depth=MAXINT, keys='*', ignore_empty=False,
                    strip_newlines=True, join_lines="\n", join_fields=None,
                    max_width=MAXINT, field_fmt="%-15s: %s",
                    error_fmt="%-15s: %s",
                    #error_fmt=bcolors.FAIL+"%-15s: %s"+bcolors.ENDC,
                    errors_only=False):
        """Format selected attributes of the object as text.

        keys         -- attribute names or abbreviations (see abbrev_map);
                        '*' or an empty value selects every field; the
                        pseudo-key 'errors' adds an ERRORS line.  A string is
                        split on whitespace.
        ignore_empty -- skip fields that are None or empty
        join_fields  -- when set, return one line joined with this string;
                        otherwise one field per line joined with join_lines
        max_width    -- lines longer than this are truncated with "..."
        errors_only  -- return None when the object has no recorded errors

        Unknown field names are appended to the object's error list.
        """
        if hasattr(self, "_errors"):
            errors = self._errors
        else:
            errors = []
        if errors_only:
            if len(errors) == 0:
                return None

        fields = []
        mpKeyValue = vars(self)
        vkeys = sorted(mpKeyValue.keys())

        abbrev_map = None
        if hasattr(self, 'abbrev_map'):
            abbrev_map = self.abbrev_map

        # Work on a private list so the caller's sequence is never mutated.
        if isinstance(keys, str):
            keys = keys.split()
        else:
            keys = list(keys) if keys else []

        print_errors = False
        if 'errors' in keys:
            print_errors = True
            keys.remove('errors')

        if not keys or '*' in keys:
            if hasattr(self, 'field_name_map'):
                keys = [f for f in self.field_name_map.values()]
            else:
                keys = vkeys

        field_name_reverse_map = None
        if hasattr(self, 'field_name_reverse_map'):
            field_name_reverse_map = self.field_name_reverse_map

        for k in keys:
            if abbrev_map:
                if k not in vkeys and k in abbrev_map:
                    k = abbrev_map[k]
            val = None
            if k in vkeys:
                val = mpKeyValue[k]
                if val is None or (is_iterable(val) and len(val) == 0):
                    if not ignore_empty:
                        val = " "
                    else:
                        continue
                val = str(val)
                if strip_newlines and ("\n" in val):
                    # Collapse each newline (and surrounding blanks) to a space.
                    val = re.sub(r'\s*\n\s*', ' ', val)
                if field_name_reverse_map:
                    # Attributes without a display name keep their own name.
                    k = field_name_reverse_map.get(k, k)
                fields.append(field_fmt % (k, val))
            elif error_fmt:
                errors.append("Bad field name: %s" % k)
        # for all keys

        if print_errors and len(errors) > 0:
            fields.insert(0, error_fmt % ("ERRORS", str(errors)))

        if join_fields:
            # we are returning a single line, so only do Indent at beginning
            ret = Indent(indent) + join_fields.join(fields)
            if len(ret) > max_width:
                ret = "%.*s..." % (max_width-3, ret)
            return ret
        else:
            ret = []
            for f in fields:
                tmp = Indent(indent) + f
                if len(tmp) > max_width:
                    tmp = "%.*s..." % (max_width-3, tmp)
                ret.append(tmp)
            return join_lines.join(ret)
    # dump_fields

    def add_error(self, error):
        """Record an editorial error message on this object."""
        self._errors.append(error)

    def fmt_errors(self, fmt=None, indent=0, header=True):
        """Return the recorded errors as indented text ('' when none).

        Only the default format (fmt=None) exists; any other value raises
        ValueError.
        """
        if not self._errors:
            return ''
        if not fmt:
            ret = []
            if header:
                ret.append(Indent(indent) + "ERRORS:")
            for e in self._errors:
                ret.append(Indent(indent+1) + e)
            return "\n".join(ret)
        # The original used assert("...") here, which can never fail.
        raise ValueError("Unknown error format type")
    # fmt_errors
# class LuteMixIn


class Difficulty(LuteMixIn):
    """Difficulty grades."""
    _OBJECTS = {}

    @classmethod
    def create_difficulties(kls):
        """Instantiate and save() one Difficulty per known grade."""
        diffs = {
            u"Beginner": 10,
            u"Easy": 20,
            u"Medium": 30,
            u"Challenge": 40,
            u"Difficult": 50,
            u"Virtuoso": 60,
        }
        for k, v in diffs.items():
            diff = kls(name=k, level=v)
            diff.save()
    # CreateDifficulties
# class Difficulty


class Key(LuteMixIn):
    """Musical keys."""

    @classmethod
    def create_key_list(kls):
        """Instantiate and save() a Key for every known csv key name.

        NOTE(review): 'flat' is not defined in the part of the file visible
        here; it must be a module-level name for this to run -- confirm.
        """
        # Intended display ordering; currently unused.
        key_order = [
            u"AM", u"Am", u"A#M", u"A#m", u"AbM", u"Abm",
            u"BM", u"Bm", u"B#M", u"B#m", u"BbM", u"Bbm",
            u"CM", u"Cm", u"C#M", u"C#m", u"CbM", u"Cbm",
            u"DM", u"Dm", u"D#M", u"D#m", u"DbM", u"Dbm",
            u"EM", u"Em", u"E#M", u"E#m", u"EbM", u"Ebm",
            u"FM", u"Fm", u"F#M", u"F#m", u"FbM", u"Fbm",
            u"GM", u"Gm", u"G#M", u"G#m", u"GbM", u"Gbm",
        ]
        keys = {
            'Ab': u"A"+flat,
            'AbM': u"A"+flat,
            'Am': u"A minor",
            'AM': u"A",
            'Bbm': u"B"+flat+" minor",
            'BbM': u"B"+flat,
            'Bm': u"B minor",
            'BM': u"B",
            'Cm': u"C minor",       # was "B minor" (copy/paste slip)
            'CM': u"C",
            'Dbm': u"D"+flat+" minor",
            'DbM': u"D"+flat,       # was D-flat minor (copy/paste slip)
            'Dm': u"D minor",
            'DM': u"D",
            'D major': u"D",
            'Ebm': u"E"+flat+" minor",
            'EbM': u"E"+flat,
            'Em': u"E minor",
            'EM': u"E",
            'Fm': u"F minor",
            'FM': u"F",
            'Gm': u"G minor",
            'GM': u"G",
        }
        # where did this come from???
        # 'GM A minor, BbM for Bb major, C
        for k, v in keys.items():
            key = Key(name=v, csv_name=k)
            key.save()
    # create_key_list
# class Key


class MusicType(LuteMixIn):
    """A node in the category > canonical > diplomatic type hierarchy."""
    _OBJECTS = {}

    def __init__(self, name, official=False, parent=None):
        super().__init__()
        self.name = name
        self.official = official
        self.parent = parent

    @classmethod
    def create_from_typemap(kls, typemap_file):
        """Populate the registry from a typemap file.

        As read here: '#' lines and blank lines are skipped, a trailing
        backslash continues a line, unindented lines name categories, lines
        starting with one tab name canonical types, and lines starting with
        two tabs name diplomatic types.  Names on a line are comma separated;
        the first name on a category/canonical line becomes the parent of
        the entries below it.
        """
        # This is a bit KLUDGEy
        lines = []
        with open(typemap_file, encoding="ISO-8859-1") as typemap:
            for line in typemap:
                if re.match(r'\s*#', line) or re.match(r"^\s*$", line):
                    continue
                while re.search(r'\s*\\\s*$', line):
                    line = re.sub(r'\s*\\\s*$', r'', line)
                    # A continuation on the very last line has nothing to
                    # append; '' ends the loop instead of StopIteration.
                    tmp = next(typemap, '')
                    line += tmp.strip(" \t")
                line = re.sub(r'[,\s]*$', r'', line)
                lines.append(line)
        # with

        cat = canon = None
        for line in lines:
            if re.match(r'\s*#', line):
                continue
            line = re.sub(r",\s", r',', line)
            if line.startswith("\t\t"):
                line = re.sub(r"^[\s\t]*", r'', line)
                dips = line.split(',')
                for dip_name in dips:
                    (diplomatic, created) = kls.get_or_create(
                        dip_name, official=True, parent=canon,
                        save=True, commit=True)
            elif line.startswith("\t"):
                line = re.sub(r"^[\s\t]*", r'', line)
                canon = None
                canons = line.split(',')
                for canon_name in canons:
                    (tmp, created) = kls.get_or_create(
                        canon_name, official=True, parent=cat,
                        save=True, commit=True)
                    if canon is None:
                        canon = tmp
            else:
                line = re.sub(r"^[\s\t]*", r'', line)
                cat = None
                cats = line.split(',')
                for cat_name in cats:
                    (tmp, created) = kls.get_or_create(
                        cat_name, official=True, save=True, commit=True)
                    if cat is None:
                        cat = tmp
        # for each line
        #kls.dump_tree()
    # create_from_typemap

    @classmethod
    def dump_tree(kls):
        """Print every registered type as "[key] [name]"."""
        everything = kls.all()
        for name, mt in everything.items():
            print("[%s] [%s]" % (name, mt.name))
        return
        # NOTE(review): the hierarchical listing below is unreachable because
        # of the return above; kept as found -- confirm which is intended.
        tlts = [v for k, v in everything.items() if v.parent == None]
        for tlt in tlts:
            print("Top: ", tlt.name)
            canons = [v for k, v in everything.items() if v.parent == tlt]
            for c in canons:
                print(Indent(1), "Canonnical:", c.name)
                ps = [v for k, v in everything.items() if v.parent == c]
                if ps:
                    print(Indent(2), "Diplomatics:")
                    for p in ps:
                        print(Indent(3), p.name)
    # dump_tree

    def dump(self):
        """Return the name and, when set, the parent's name as text."""
        tmp = [self.name]
        if self.parent:
            tmp.append("parent: %s" % self.parent.name)
        #if self.official: tmp.append("official: True")
        return "\n\t".join(tmp)
# class MusicType


class Instrument():
    """An instrument name with its full name and ordering code."""
    mpNameInst = {}     # name -> Instrument
    _id = 0

    def __init__(self, name, fullname, order):
        self.name = name
        self.fullname = fullname
        self.order = order

    #name = models.CharField(max_length=254)
    #slug = models.CharField(max_length=254)
    #fullname = models.CharField(max_length=254)
    #order = models.CharField(max_length=64, default="ZBOGUS")

    @classmethod
    def add(kls, inst=None, name=None, fullname=None, order=None):
        """Register an instrument (creating it when inst is not given),
        assign it the next id and return it."""
        if not inst:
            if not order:
                order = "ZZZZZBOGUS"
            inst = kls(name=name, fullname=fullname, order=order)
        kls._id += 1
        inst.id = kls._id
        #inst.save()
        kls.mpNameInst[inst.name] = inst
        return inst

    @classmethod
    def create_from_file(kls, filename):
        """Flesh out the instrument list from an instrument-order file.

        As read here: blank lines and '#' comment lines are skipped, a line
        starting with '#=' resets the ordering code, '#' starts an
        end-of-line comment, and "name | fullname" supplies a separate full
        name.  The ordering code is advanced with IncrementString() after
        each instrument.
        """
        inst_order_file = filename
        with open(inst_order_file) as fh:
            instruments = fh.read().splitlines()
        order = 1
        for name in instruments:
            if name == '':
                continue
            if name.startswith('#='):
                # This line sets the ordering to a new value
                # This allows us to leave gaps for future instruments
                order = name[2:]
                continue
            if name.startswith('#'):
                continue
            # strip EOL comments (and the blanks left in front of them)
            name = re.sub(r'#.*$', '', name).strip()
            if not name:
                continue
            if '|' in name:
                # Tolerate any spacing around the separator.
                name, fullname = [s.strip() for s in name.split('|', 1)]
            else:
                fullname = name
            inst = kls.add(name=name, fullname=fullname, order=order)
            order = IncrementString(order)
        # for all lines in the instruments file
        if False:
            for i in sorted(kls.mpNameInst.values(), key=lambda a: a.order):
                print("%s [%s]" % (i, i.order))
    # createInstruments

    @classmethod
    def order_from_instruments(kls, instsIn):
        """Build the sort code for a group of instruments.

        Single instrument ordering is based on scanner/INSTRUMENTS.alpha.
        Groups (including group of 1) have their instrument count prepended
        to a concatenation of the individual instrument ordering codes.
        Mostly, this puts single instruments at the top and large groups of
        multiple instruments at the bottom, but the score is always last.
        """
        ch_num_insts = None
        tmp = []
        for i in instsIn:
            if i:
                if not i.name in kls.mpNameInst:
                    # unknown instrument!
                    inst = Instrument.add(name=i.name, fullname=i.name)
                    kls.mpNameInst[i.name] = inst
                order = kls.mpNameInst[i.name].order
                if order[0] in '0123456789':
                    # instrument (probably the score) has its own number
                    if ch_num_insts is None or order[0] > ch_num_insts:
                        ch_num_insts = order[0]
                    order = order[1:]   # clip it off
                tmp.append(order)
        tmp.sort()
        if ch_num_insts is None:
            ch_num_insts = chr(ord('0') + len(tmp))     # yields '1' through '9'
        order = ch_num_insts + "".join(tmp)
        return order
    # orderFromInstruments

    @classmethod
    def clean_instrument(kls, sIn):
        """Normalize one instrument name: trim, lower-case, upper-case
        single letters that follow a space, collapse blank runs, and drop a
        trailing number or a "| duet/ground/treble" suffix."""
        # The original called strip() but discarded its result.
        sOut = sIn.strip()
        sOut = sOut.lower()
        sOut = re.sub(r"( [a-z]\b)", lambda pat: pat.group(0).upper(), sOut)
        sOut = reSpaces.sub(' ', sOut)
        sOut = reNum.sub('', sOut)
        sOut = reOther.sub('', sOut)
        return sOut
    # clean_instrument

    @classmethod
    def get_instruments(kls, sIn):
        """Split sIn on '.'/',' and return the matching Instruments sorted
        by order, registering unknown names with a "ZZBOGUS<id>" order."""
        tmp = reSep.split(sIn)
        ret = []
        for i in tmp:
            iname = kls.clean_instrument(i)
            if iname in kls.mpNameInst:
                inst = kls.mpNameInst[iname]
            else:
                #(inst, created) = kls.objects.get_or_create(iname)
                inst = kls.add(name=iname)
                inst.fullname = iname
                inst.order = "ZZBOGUS" + str(inst.id)
                #inst.save()
                kls.mpNameInst[inst.name] = inst
            ret.append(inst)
        ret.sort(key=lambda a: a.order)
        return ret
    # get_instruments

    def __repr__(self):
        if self.name == self.fullname:
            return self.name
        else:
            return "%s [%s]" % (self.name, self.fullname)
#class Instrument


class Country(LuteMixIn):
    """A country, identified by name."""
    _OBJECTS = {}

    def __init__(self, name):
        super().__init__()
        self.name = name

    def __str__(self):
        return self.name
# class Country


class ComposerAlias(LuteMixIn):
    """An alternative name for a Composer."""
    # NOTE: potential m2m
    _OBJECTS = {}

    def __init__(self, name, composer):
        super().__init__()
        self.name = name
        self.composer = composer

    def __str__(self):
        return self.name
# class ComposerAlias


class Composer(LuteMixIn):
    """A composer with name parts, countries and aliases."""
    # NOTE: potential m2m
    _OBJECTS = {}

    #full_name = models.CharField(max_length=254)
    #first_name = models.CharField(max_length=254)
    #last_name = models.CharField(max_length=254)
    #sort_name = models.CharField(max_length=254, blank=True) # this is the field we sort on
    #suspicious = models.BooleanField(default=False)
    #aliases = models.ManyToManyField(ComposerAlias, blank=True)
    #country = models.ForeignKey(Country, blank=True, null=True)
    #country_origin = models.ForeignKey(Country, blank=True, related_name="%(app_label)s_%(class)s_related_origin", null=True)
    #date_of_birth = models.DateField(blank=True, null=True)
    #date_of_death = models.DateField(blank=True, null=True)
    #url_info = models.CharField(max_length=254, blank=True)
    #html = models.TextField(blank=True)

    def __init__(self, full_name=None, first_name=None, last_name=None,
                 country=None, country_origin=None):
        super().__init__()
        self.full_name = full_name
        self.last_name = last_name
        # Only transliterate a real name; last_name may be None when the
        # composer is created from a bare full name.
        if self.last_name:
            self.sort_name = latin1_to_ascii(self.last_name)
        else:
            self.sort_name = self.last_name
        self.first_name = first_name
        self.country = country
        self.country_origin = country_origin
        if self.country:
            (self.country, created) = Country.get_or_create(self.country)
        if self.country_origin:
            # The original stored this result in self.country by mistake.
            (self.country_origin, created_origin) = \
                Country.get_or_create(self.country_origin)
        self.aliases = []
    # __init__

    @classmethod
    def create_from_file(kls, csv_file):
        """Load composers from a tab-separated ISO-8859-1 file.

        Columns: full name, "Last, First", country, then any number of
        aliases.  Each composer is registered under its full name.
        """
        with open(csv_file, encoding="ISO-8859-1") as csvfile:
            reader = csv.reader(csvfile, delimiter="\t")
            for (i, cdata) in enumerate(reader):
                if not cdata:
                    continue    # blank row
                # If more than 3 fields, the non-empty ones are aliases
                aliases = [x for x in cdata[3:] if x != '']
                full_name = str(cdata[0])
                # Read this before the header test below, which uses it.
                last_first = str(cdata[1]) if len(cdata) > 1 else ''
                if full_name == "Composer" and last_first == "Last, First":
                    # this is a bogus header line left over from the Reagan era.
                    continue
                country_name = 'Unknown'
                if len(cdata) > 2:
                    country_name = str(cdata[2])
                tmp = re.split(r'\s*,\s*', last_first)
                first = ''
                last = tmp[0]
                if len(tmp) > 1:
                    first = tmp[1]
                comp = Composer(full_name=full_name, first_name=first,
                                last_name=last, country=country_name,
                                country_origin=country_name)
                #comp.save() # so we have id for m2m
                kls.add(comp.full_name, comp)
                for a in aliases:
                    ca = ComposerAlias(a, comp)
                    #ca.save()
                    comp.aliases.append(ca)
                #comp.save()
        # with
    # CreateComposers

    def save(self, *args, **kwargs):
        """Default sort_name to last_name, then defer to the mix-in save()."""
        if not self.sort_name:
            self.sort_name = self.last_name
        super(Composer, self).save(*args, **kwargs)

    def name(self):
        """Return "First Last", whichever part exists, or 'No Name'."""
        name = ''
        if self.first_name:
            if self.last_name:
                name = "%s %s" % (self.first_name, self.last_name)
            else:
                name = self.first_name
        elif self.last_name:
            name = self.last_name
        else:
            name = 'No Name'
        # smart_unicode() is not defined in this module; str is already
        # unicode on Python 3.
        return str(name)
    # name

    def __unicode__(self):
        return self.name()

    @classmethod
    def composer_list(cls):
        """Return [(initial, [composers and aliases sorted by sort_name])].

        NOTE(review): relies on a 'Composer.objects' manager, piece_set and
        aliases.all(), none of which are defined in the visible source --
        confirm this method is still in use.
        """
        comp_all = Composer.objects.order_by('sort_name')
        tmp = []
        for c in comp_all:
            if not c.piece_set.count():
                continue
            if c.sort_name:
                c.ch_sort = c.sort_name[0].upper()
                tmp.append(c)
            alias = c.aliases.all()
            for a in alias:
                if a.sort_name:
                    a.ch_sort = a.sort_name[0].upper()
                    a.composer = c
                    tmp.append(a)

        mpChComposer = {}
        for c in tmp:
            if not c.ch_sort in mpChComposer:
                mpChComposer[c.ch_sort] = []
            mpChComposer[c.ch_sort].append(c)

        # Finally we sort the sort characters, and then each list under that char
        comp_list = []
        for ch in sorted(mpChComposer):
            mpChComposer[ch].sort(key=lambda c: c.sort_name)
            comp_list.append((ch, mpChComposer[ch]))
        return comp_list
    # composer_list

    def dump(self):
        """Print the composer's name fields, country and aliases."""
        country_name = self.country.name if self.country else None
        print("[%s] [%s] [%s] [%s] [%s]" % (self.full_name, self.first_name,
                                            self.last_name, self.sort_name,
                                            country_name))
        if self.aliases:
            for a in self.aliases:
                print(Indent(1), str(a))

    @classmethod
    def dump_all(kls):
        """dump() every registered composer in key order."""
        for (k, v) in kls.all(sorted=True):
            v.dump()

    def __str__(self):
        return self.full_name

    class Meta:
        ordering = ['last_name']
# class Composer


class Piece(LuteMixIn):
    """Abstract piece of music realized by one or more Settings."""
    _OBJECTS = {}
    _sorted = False

    # Signature: (??) title || composer
    def __init__(self, settings=None):
        super().__init__()
        self.settings = settings
        # NOTE(review): add() takes (key, obj); this call passes only one
        # argument and raises TypeError.  The registry key for a Piece is
        # not decided anywhere in the visible source -- confirm and fix.
        Piece.add(self)

    def add_setting(self, setting):
        """Append a Setting to this Piece."""
        if not self.settings:
            self.settings = []
        self.settings.append(setting)

    def dump(self, indent=0, depth=MAXINT, flags="", extra_flags=""):
        """Not implemented; deliberately raises ZeroDivisionError."""
        1/0
# class Piece


class Part(LuteMixIn):
    """One row of dft.tsv, i.e. the metadata of a single .ft3 file."""
    _OBJECTS = {}
    _sorted = False

    # Short flag -> attribute name, used by dump()/dump_fields().
    abbrev_map = {
        "ar": "arranger",
        "c0": "composer0",
        "cm": "composer",
        "cn": "contributor",
        "da": "date",
        "di": "difficulty",
        "do": "document",
        "ed": "editor",
        "ec": "encoder",
        "es": "ensemble",
        "fn": "footnote",
        "ff": "fronimo_file",
        "in": "intabulator",
        "insts": "instruments",
        "ky": "key",
        "md": "midi_file",
        "os": "orig_sub",
        "pg": "page",
        "pt": "part",
        "pf": "pdf_file",
        "pi": "piece",
        "pu": "publisher",
        "rm": "remarks",
        "sc": "section",
        "su": "subtitle",
        "ti": "title",
        "ty": "type",
    }

    # Column heading -> attribute name.
    field_name_map = collections.OrderedDict([
        ("Piece", "piece"),
        ("Section", "section"),
        ("Part", "part"),
        ("Ensemble", "ensemble"),
        ("Title", "title"),
        ("Orig. subtitle", "orig_sub"),
        ("Subtitle", "subtitle"),
        ("Type", "type"),
        ("Key", "key"),
        ("Difficulty", "difficulty"),
        ("Composer", "composer"),
        ("Orig. composer", "composer0"),
        ("Publisher", "publisher"),
        ("Document", "document"),
        ("Page", "page"),
        ("Editor", "editor"),
        ("Encoder", "encoder"),
        ("Arranger", "arranger"),
        ("Intabulator", "intabulator"),
        ("Contributor", "contributor"),
        ("Remarks", "remarks"),
        ("Footnote", "footnote"),
        ("Date", "date"),
        ("Fronimo", "fronimo_file"),
        ("PDF", "pdf_file"),
        ("Midi", "midi_file"),
    ])
    field_name_reverse_map = None   # built lazily by __init__

    def __init__(self, attrs):
        """Convert the dictionary into instance attributes.

        attrs maps column headings (keys of field_name_map) to strings; an
        unknown heading raises KeyError.
        """
        super().__init__()
        for k, v in attrs.items():
            field_name = self.field_name_map[k]
            setattr(self, field_name, v)

        # For pretty printing
        if not Part.field_name_reverse_map:
            Part.field_name_reverse_map = dict(
                [(v, k) for k, v in Part.field_name_map.items()])

        # The field "Piece" is from an earlier time and does **not** equate
        # with the current meaning of Piece as an abstraction including all
        # Settings() that realize a Piece of music.
        if self.title == '':
            self.title = self.piece
        if self.part == '':
            self.part = self.ensemble

        # Archlute => archlute

        self.ornamented = (re.search(r'^.*_[^/]*O[^/a-z]*\.ft3$',
                                     self.fronimo_file) is not None)
        self.performance = (re.search(r'^.*_[^/]*P[^/a-z]*\.ft3$',
                                      self.fronimo_file) is not None)

        self.set_instruments()

        self.section_order = 0
        if self.section:
            if self.section == "*":
                self.section_order = -1
                self.section = 'Complete'
            else:
                num = re.sub(r'^\s*(([0-9]+)\.)?\s+.*$', r'\2', self.section)
                try:
                    if num:
                        self.section_order = int(num)
                    else:
                        self.add_error("Section name has no number: %s"
                                       % self.section)
                except ValueError:
                    # The pattern did not match, so num is the raw name.
                    self.add_error("Bad Section name: %s" % self.section)

        # Normalize the file references to site-relative, '/'-separated paths.
        for attr in ("fronimo_file", "midi_file", "pdf_file"):
            path = getattr(self, attr)
            path = re.sub(r"\\", "/", path)
            path = re.sub(r"C:/website/", "", path)
            path = re.sub(r"http://gerbode.net/", "", path)
            setattr(self, attr, path)

        # Build the signature for grouping related files/parts together
        # The signature wants to:
        #   a: be as unique as possible
        #   b: without accidentally excluding things that belong together
        # After a bunch of playing around it was determined that the
        # "best" signature was based on the originating document + the date.
        self.signature = "%s / %s / %s / %s" % (self.publisher, self.document,
                                                self.page, self.date)

        # Now establish our foreign keys
        if self.composer == '':
            self.composer = self.composer0
        if self.composer:
            (self.composer, created) = Composer.get_or_create(self.composer)
            if created:
                # this should not happen!
                pass
        if self.composer0:
            (self.composer0, created) = Composer.get_or_create(self.composer0)
            if created:
                # this should not happen!
                pass
        """
        if self.type:
            (self.type, created) = MusicType.get_or_create(self.type)
            if created:
                # this should not happen!
                pass
        """
    # __init__

    def set_instruments(self):
        """Derive instruments and order from the part field and record an
        error for every instrument that is missing from the ensemble."""
        self.instruments = Instrument.get_instruments(self.part)
        self.order = Instrument.order_from_instruments(self.instruments)

        # sanity check that instruments is a subset of ensemble
        if self.instruments[0].fullname == 'score':
            return
        ensem = dict((e.fullname, 1)
                     for e in Instrument.get_instruments(self.ensemble))
        for i in self.instruments:
            if i.fullname not in ensem:
                self.add_error("'%s' in part but not in ensemble" % i.fullname)
    # set_instruments

    def ensemble_encode(self):
        """Store EnsembleEncode(self.ensemble) as self.ensemble_sort.

        The original read the non-existent attribute self.insts and so
        always raised AttributeError.
        """
        self.ensemble_sort = EnsembleEncode(self.ensemble)

    def get_title(self):
        """Return the title."""
        return self.title

    def get_long_title(self):
        """Return "title / subtitle / section" with '--' for empty parts."""
        t = self.title
        if not t:
            t = '--'
        s = self.subtitle
        if not s:
            s = '--'
        sect = self.section
        if not sect:
            sect = '--'
        ret = "%s / %s / %s" % (t, s, sect)
        return ret

    def dump(self, indent=0, depth=MAXINT, flags="es ff", extra_flags="",
             ignore_empty=False, errors_only=False):
        """Format the fields named by flags/extra_flags (whitespace
        separated abbreviations) via dump_fields(); '' once depth runs out."""
        depth -= 1
        if depth <= 0:
            return ''
        if extra_flags:
            flags += " " + extra_flags
        fields = flags.split()
        return self.dump_fields(keys=fields, indent=indent, depth=depth,
                                ignore_empty=ignore_empty,
                                errors_only=errors_only)
    # dump()
# class Part


class Section(LuteMixIn):
    """A group of Parts forming one identified sub-portion of a Setting."""

    def __init__(self, parts=None):
        super().__init__()
        self.parts = parts
        self.title = None
        self.section_order = 0

    def set_title(self):
        """Take the title from the first part's section, once."""
        if self.title:
            return
        # KLUDGE
        if self.parts:
            self.title = self.parts[0].section

    def add(self, part):
        """Append a part, keep the parts sorted and refresh section_order."""
        if not self.parts:
            self.parts = []
        self.parts.append(part)
        # list.sort() no longer accepts a comparison function, and
        # CmpEnsemble is only a stub.
        # NOTE(review): sorting on the part's instrument order code is an
        # assumption about the intended ordering -- confirm.
        self.parts.sort(key=lambda p: p.order)
        self.section_order = self.parts[0].section_order

    def dump(self, indent=0, depth=MAXINT, flags="", extra_flags="",
             ignore_empty=False, errors_only=False):
        """Return the section title followed by its dumped parts, or None
        when errors_only is set and no part reported anything."""
        ret = []
        depth -= 1
        if depth > 0:
            for p in self.parts:
                tmp = p.dump(indent+1, depth, flags=flags,
                             extra_flags=extra_flags,
                             ignore_empty=ignore_empty,
                             errors_only=errors_only)
                if tmp:
                    # if errors_only is true, then tmp will be None if no errors
                    ret.append(tmp)
        if errors_only and not ret:
            return None
        ret.insert(0, "%s%s:" % (Indent(indent), self.title))
        return "\n".join(ret)
# class Section


class Setting(LuteMixIn):
    """All Parts (optionally grouped into Sections) sharing one signature."""
    _OBJECTS = {}   # KLUDGE! This is our "database" for the moment
    _sorted = False

    def __init__(self, signature, parts=None, sections=None):
        super().__init__()
        self.title = None
        self.signature = signature
        self.parts = parts
        self.sections = sections
        self.set_title()

    def set_title(self):
        """Take title and ensemble from the first part, once."""
        if self.title:
            return
        # KLUDGE
        parts = self.all_parts()
        if parts:
            self.title = parts[0].title
            self.ensemble = parts[0].ensemble

    def add_section(self, section):
        """Append a Section and make sure it has a title."""
        if not self.sections:
            self.sections = []
        self.sections.append(section)
        section.set_title()

    def add_part(self, part):
        """Append an unassociated Part."""
        if not self.parts:
            self.parts = []
        self.parts.append(part)     # was the undefined name 'obj'

    def analyze_parts(self):
        """Currently a no-op."""
        return
        # NOTE(review): unreachable because of the return above; kept as
        # found -- confirm whether error propagation should be re-enabled.
        for p in self.all_parts():
            if p._errors:
                self.add_error("%s || %s" % (p.fmt_errors(header=False),
                                             p.fronimo_file))

    def analyze_sections(self):
        """Move Parts that carry a section number into Sections.

        Records an error when the Setting ends up with both Sections and
        unassociated Parts.
        """
        # Here we try to find if there is 1 or more Sections
        # and if so move the Parts into the Sections
        sects = collections.defaultdict(list)
        _parts = self.parts or []
        self.parts = []
        for p in _parts:
            if p.section_order:
                sects[p.section_order].append(p)
            else:
                self.parts.append(p)
        if len(sects) > 0:
            if len(self.parts) > 0:
                self.add_error("Setting has both Sections and unassociated Parts:\n\t%s (%s)"
                               % (self.title, self.signature))
            for k, parts in sorted(sects.items()):
                section = Section(parts)
                # Section(parts) leaves section_order at 0, which made the
                # sort in dump() meaningless; record the real number.
                section.section_order = k
                self.add_section(section)
                if section._errors:
                    self.add_error("%s || %s" % (section.title,
                                                 section.fmt_errors(header=False)))
    # analyze_sections

    def analyze(self):
        """Run all analysis passes."""
        self.analyze_parts()
        self.analyze_sections()

    def all_parts(self):
        """Return every Part: those inside Sections first, then the rest."""
        parts = []
        # Note: the following allows for there to be both
        # Sections() with parts *and* unassociated parts,
        # a situation that should never happen.
        if self.sections:
            for s in self.sections:
                parts.extend(s.parts)
        if self.parts:
            parts.extend(self.parts)
        return parts

    def titles(self, min_len=1, clean_cruft=False):
        """Return a dict of title -> parts, or None.

        Section titles are only included when a single Section contains
        more than one distinct title.  With clean_cruft, None is returned
        when all titles reduce to fewer than two after normalizing leading
        numbers and removing a trailing parenthetical.  None is also
        returned when fewer than min_len titles were found.
        """
        titles = collections.defaultdict(list)
        if self.sections:
            for s in self.sections:
                s_titles = collections.defaultdict(list)
                for p in s.parts:
                    s_titles[p.title].append(p)
                if len(s_titles.keys()) > 1:
                    for st, ps in s_titles.items():
                        titles[st] = s_titles[st]
        if self.parts:
            for p in self.parts:
                titles[p.title].append(p)

        if clean_cruft:
            tmp = collections.defaultdict(list)
            # Two special cases where titles that are not perfect matches
            # are consider to be a match:
            # 1. Title starts with 18., 18a., 18b., etc. and all can reduce to 18.
            # 2. Differ only in that they have a trailing (parenthetical note)
            for title in sorted(titles.keys()):
                # Normalize leading number variants
                t = re.sub(r'^\s*([0-9]+)[a-z]*\.(.*)$', r'\1. \2', title)
                # Remove trailing parens
                t = re.sub(r'\s*\(.*\)\s*$', r'', t)
                tmp[t].append(title)    # a list of titles that maps to this simpler title
            # for
            """
            ret = {}
            for t,parts in tmp.items():
                if len(parts) > 1:
                    pass
            """
            # remove all but one of these simpler-but-not-the-same title from the list
            if len(tmp.keys()) < 2:
                # then all titles were *essentially* the same
                return None
        # number & paren removal
        if len(titles.keys()) < min_len:
            return None
        return titles
    # titles

    def ensembles(self):
        """Return a dict of ensemble name -> parts."""
        ret = collections.defaultdict(list)
        for p in self.all_parts():
            ret[p.ensemble].append(p)
        return ret

    def dump(self, indent=0, depth=MAXINT, flags="", extra_flags="",
             ignore_empty=False, errors_only=False):
        """Return "title (signature):" followed by the dumped sections,
        parts and errors; None when errors_only is set and nothing was
        reported."""
        ret = []
        depth -= 1
        if depth > 0:
            if self.sections:
                self.sections.sort(key=lambda x: x.section_order)
                for s in self.sections:
                    tmp = s.dump(indent=indent+2, depth=depth, flags=flags,
                                 extra_flags=extra_flags,
                                 ignore_empty=ignore_empty,
                                 errors_only=errors_only)
                    if tmp:
                        ret.append(tmp)
                if errors_only and len(ret) > 0:
                    ret.insert(0, "%sSections:" % Indent(indent+1))
            if self.parts:
                parts = []
                if self.sections:
                    if 'sc' not in (flags + extra_flags):
                        extra_flags += " sc"
                remaining = len(self.parts)
                for p in self.parts:
                    tmp = p.dump(indent=indent+2, depth=depth, flags=flags,
                                 extra_flags=extra_flags,
                                 ignore_empty=ignore_empty,
                                 errors_only=errors_only)
                    if not tmp:
                        continue
                    if remaining and parts:
                        # separator between consecutive parts
                        parts.append(Indent(indent+2) + "====================================================")
                    remaining -= 1
                    parts.append(tmp)
                if self.sections and parts:
                    parts.insert(0, "%sParts:" % Indent(indent+1))
                if parts:
                    ret.extend(parts)
        if self._errors:
            ret.append(self.fmt_errors(indent=indent+1))
        if errors_only and len(ret) == 0:
            return None
        ret.insert(0, "%s%s (%s):" % (Indent(indent), self.title,
                                      self.signature))
        return "\n".join(ret)
    # dump

    @classmethod
    def create_from_csv(kls, csv_file):
        """
        Master ingest routine converting the output of dft.pl (aka dft.tsv)
        to Parts(), and then grouping Parts() into Settings().

        Every signature gets exactly one Setting, which is analyzed and
        registered via Setting.add(sig, setting).  Nothing is returned.
        """
        mpSigParts = collections.defaultdict(list)
        with open(csv_file, encoding="ISO-8859-1") as csvfile:
            reader = csv.DictReader(csvfile, delimiter="\t")
            for i, part_dict in enumerate(reader):
                tmp = {}
                # clean up crap in the dict
                for k, v in part_dict.items():
                    if k is None:
                        continue    # surplus fields beyond the header
                    if v is None:
                        print(i, k, v)
                        v = ''
                    # snarf weird "\lang123" patterns from the file
                    v = re.sub(r'\\lang[0-9]+\s?', '', v)
                    tmp[k] = v
                part = Part(tmp)    # pass cleaned up dict
                mpSigParts[part.signature].append(part)
            # for
        # with

        # Now that all of the parts are grouped together
        # we need to analyze them into their separate settings
        # and (possibly) sections-within-setting,
        for sig, parts in mpSigParts.items():
            setting = Setting(sig, parts)
            setting.analyze()
            Setting.add(sig, setting)
    # create_from_csv
# class Setting


def PrintSetting(setting, flags="*", ignore_empty=True, errors_only=False):
    """Print one Setting's dump followed by a blank line, if non-empty."""
    tmp = setting.dump(flags=flags, ignore_empty=ignore_empty,
                       errors_only=errors_only)
    if tmp:
        print(tmp)
        print()
# PrintSetting


def PrintSettings(**kwargs):
    """PrintSetting() every registered Setting in signature order."""
    # flags="pt ff errors", ignore_empty=True, errors_only=False):
    for (k, setting) in Setting.all(sorted=True):
        PrintSetting(setting, **kwargs)
# PrintSettings


def AnalyzeSettingDirectories(ignore_key=False):
    """Report Settings whose files live in more than one directory.

    Directories ending in a key name (e.g. .../Dm/) are listed under
    "Keys"; with ignore_key, Settings that differ only by key directory are
    skipped.
    """
    for k, setting in Setting.all(sorted=True):
        dirs = collections.defaultdict(list)
        for part in setting.all_parts():
            directory = re.sub(r'[^/]*$', r'', part.fronimo_file)
            dirs[directory].append(part)
        if len(dirs) > 1:
            # Sometimes we have the same setting in more than 1 key.
            # The directory names will be of the form:
            #   .../Dm/... vs. .../Am/...
            keys = collections.defaultdict(list)
            nokey = []
            for d, parts in dirs.items():
                m = re.search(r'[_/]([A-Z]b?[mM])/$', d)
                if not m:
                    key = "No Key"
                    nokey.append(parts)
                else:
                    key = m.group(1)
                    keys[key].append(parts)

            did_sig = False
            if keys:
                if ignore_key and not nokey:
                    continue
                if not ignore_key:
                    if not did_sig:
                        did_sig = True
                        print("%s (%s)\n" % (setting.title, setting.signature))
                    print("\tKeys")
                    for key_name, list_parts in keys.items():
                        print("\t %s:" % key_name)
                        for l in list_parts:
                            for p in l:
                                print("\t\t%s: %s" % (p.key, p.fronimo_file))
                    print()
            if nokey:
                if not did_sig:
                    did_sig = True
                    print("%s (%s)\n" % (setting.title, setting.signature))
                print("\tNo key in filename")
                for parts in nokey:
                    for p in parts:
                        print("\t %s: %s" % (p.key, p.fronimo_file))
                print()
            if not did_sig:
                print("%s (%s)\n" % (setting.title, setting.signature))
                for d, ps in dirs.items():
                    tmp = ["%s: %s" % (p.key, p.fronimo_file) for p in ps]
                    print("\t%s\n\t %s" % (d, "\n\t ".join(tmp)))
                print()
            print("\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
    # for all settings
# AnalyzeSettingDirectories


def AnalyzeSettingComposers():
    """Report Settings whose parts disagree on composer or orig. composer."""
    settings = Setting.all()
    for k in sorted(settings.keys()):
        setting = settings[k]
        composers = collections.defaultdict(list)
        composers0 = collections.defaultdict(list)
        for p in setting.all_parts():
            composers[p.composer].append(p)
            composers0[p.composer0].append(p)
        if len(composers) > 1 or len(composers0) > 1:
            if len(composers) > 1:
                print("Multiple Composers")
                for c, ps in composers.items():
                    tmp = [p.fronimo_file for p in ps]
                    print("\t[%s]\n\t\t%s" % (c, "\n\t\t".join(tmp)))
                print()
            if len(composers0) > 1:
                print("Multiple Composer0s")
                for c, ps in composers0.items():
                    tmp = [p.fronimo_file for p in ps]
                    print("\t[%s]\n\t\t%s" % (c, "\n\t\t".join(tmp)))
                print()
            print("Setting:")
            PrintSetting(setting, flags="cm c0 ff", ignore_empty=True,
                         errors_only=False)
            print("\n++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n\n")
    # for all settings
# AnalyzeSettingComposers

# (another top-level "def" follows here; it continues past the visible source)
DoMain(): parts_file = "dft_py.tsv" #parts_file = "dft_test.tsv" music_types_file = "typemap.txt" composers_file = "compdata.tsv" instruments_file = "instrument_order.txt" # We create Insturments, Composers, and MusicTypes # first because we have foreign key refs to them # from Parts. MusicType.create_from_typemap(music_types_file) Composer.create_from_file(composers_file) Instrument.create_from_file(instruments_file) Setting.create_from_csv(parts_file) #AnalyzeSettingComposers() #AnalyzeSettingDirectories(ignore_key=False) #PrintSettings("cm c0 pt ff errors", errors_only=False) PrintSettings(flags="* errors", ignore_empty=True) if __name__ == "__main__": DoMain()