# Script name; echoed at the start and at the end of the run.
progName = "GedExplore_v04.py"
#by David@ColeCanada.com
#on 2023GJul10
#Purpose to explore a GEDCOM
# by
# 1 loading the keys into a Python dictionary
# 2 creating a ged family database
#   putting all info into a Python Dictionary
#   using unique keys
# 3 display the metrics
# v03 will use "try" when reading the ged file
#
# TODO
# 0. validate FileName
#    ignore records preceding "1 SCHEMA"
# 1. get key between @ and @
# 2. parse the NAME correctly, allowing for mistakes
#      given/surname/title
#      both given name & surname must not be null
# 3. find refs to missing INDI
# 4. find FAMs with no parent(s)
# 5. during second pass
#      ensure FAMS match and FAMC match
#      count INDIs and FAMs
#      gather BIRT, DEAT, MARR, DATE, PLAC, SEX, 
#      ensure that each record has been used
# Goal: see GEDExplore_plan.txt
# Issue 1 Encountered utf-8 error in "ColesOfDevonDeLeg15L10.ged"
#         after record 674
#         So I used "lastGood=674"
#         when lastGood==10 it will list all recN
#         but when I copied bad file to "ColesOfDevo.ged", it worked fine
#              so this must be a data record in a file stored on microsd. Doh!
#         I believe that these 2 files "should be" identical. Doh!
# Announce which version of the script is running.
print(progName)

#### Control Parameters
# Upper limit on how many dictionary entries of each kind are copied
# into the listing (normally 10).
displayOnly = 10
# Placeholder reported in the summary until a "1 FILE " line is read.
FILEname = "not stated in header"
# Collects header records and sample dictionary entries for the optional dump.
prefixList = list()
################################################################
# Input file selection.
# Exactly one gedFile assignment is active; the others are GEDCOM files tried
# earlier, kept with a note on whether a utf-8 read error was seen and the
# lastGood value that was used with each.
#gedFile=input("gedFile name")
#gedFile="COLEDES4.GED"  #no utf-8 error
#lastGood=10
#gedFile="SmithEg.ged"  #no utf-8 error
#lastGood=10
#gedFile="davidcole3.ged"  #utf-8 error after nRec 10924
#lastGood=10925
#gedFile="ColesOfDevon.ged"  #no utf-8 error
#lastGood=10
#gedFile="ColesOfDevonDeLeg15L10.ged"  #error after nRec 674
#lastGood=674
#gedFile="JohnsonSampleTree_asof_2022BFeb08.ged"  #no utf-8 error
#lastGood=10
#gedFile="Cole21CMar22_asof_22DApr06.ged"  #error after 15983
#lastGood=15983
#gedFile="Fam1.GED"  #no utf-8 error
#lastGood=10
#lastGood=15983
#gedFile="Fam1.GED"
#gedFile="ColeDavid2021LDec14_deFS.ged"
#gedFile="ColeDavid2021LDec14_deFS_viaGeany.ged"
#gedFile="ColeDavid2021LDec14_deFS_202viaGeany.ged"
#gedFile="ColeDavid2021LDec14_deFS_998viaGeany.ged"
# Active choice (note recorded with it: error after 15983):
gedFile = "Cole21CMar22_asof_22DApr06_X15984.ged"
# Open the GEDCOM as text with an explicit encoding instead of the platform
# default, so the "utf-8" errors this script counts mean the same thing on
# every machine.  "utf-8-sig" decodes exactly like utf-8 but also drops a
# leading byte-order mark; without that a BOM would stay glued to the first
# line and the "0 HEAD" comparison made while reading would not match.
g = open(gedFile, encoding="utf-8-sig")

# Number of reads attempted so far (incremented before every readline).
nRec = 0
# Extracted INDI / FAM / SOUR information, keyed by strings starting with "@".
gedDict = {}
gedHeadDict = {}
# One message per read that failed to decode.
errList = []
# True while header records are being collected.
isHead = False
# True when the previous "0 @...@" record line is still waiting for its detail lines.
isPrevKey = False
# Family detail lines (first six characters) that are stored in gedDict.
famList = {"1 HUSB", "1 WIFE", "1 CHIL"}
# Number of failed reads, and the nRec at which the first failure happened.
errCnt = 0
firstUTF_recN = 0
# The most recently read line, quoted in error messages.
prevRec = ""
# isHdrProcessed is True only after the header has been fully processed
isHdrProcessed = False
# Main read loop: fetch the GEDCOM one line ("record") at a time so that a
# decoding failure can be trapped for a single read instead of ending the run.
#   * lines seen before the first record are kept in prefixList as
#     "Header Record:..." entries, and a "1 FILE " line among them is
#     remembered in FILEname;
#   * after a "0 @I...@" or "0 @S...@" line, the text of the next line
#     (from column 7 on) is stored in gedDict under the record's @...@ id;
#   * after a "0 @F...@" line, each following HUSB / WIFE / CHIL line is
#     stored under id + tag until the next "0 @" line.
while True:
    nRec += 1
    # progress marker on long files
    if nRec % 10000 == 0:
        print(nRec)

    try:
        line = g.readline()
    except UnicodeDecodeError:
        # Only decoding failures are counted here; anything else (Ctrl-C,
        # I/O errors) propagates instead of being reported as a utf-8 error.
        errCnt += 1
        if errCnt == 1:
            firstUTF_recN = nRec
        errList.append("utf-8 err after:" + "nRec:" + str(nRec - 1)
                       + ",prevRec:" + prevRec.strip())
        # placeholder so the rest of the loop still sees a non-empty line
        line = "bad+utf-8 error" + "\n"

    prevRec = line
    if line == '':
        # readline() returns '' only at end of file; that read was not a
        # record, so take it back out of the count before leaving the loop.
        nRec -= 1
        break

    if nRec >= 9555000:
        # hard cap: reads from 9555000 onward are counted but not analysed
        continue

    if line[0:6] == "0 HEAD":
        isHead = True
        isHdrProcessed = False  # not yet fully processed

    if not isHdrProcessed:
        if line[0:4] == "0 @I":
            # first individual record: stop collecting header records
            isHead = False
        else:
            prefixList.append("Header Record:" + line.strip())
            # Only a FILE line met before the first record is taken as the
            # internal file name; "1 FILE " lines further down the file
            # must not overwrite it.
            if line[0:7] == "1 FILE ":
                FILEname = line.strip()

    if not isHead:
        if line[0:3] == "0 @":
            isPrevKey = True
            prevLine = line
            isHdrProcessed = True
            # Key = the complete id between the two '@' signs (inclusive),
            # whatever its length, so long ids are neither truncated nor
            # merged with one another.
            closeAt = line.find("@", 3)
            if closeAt == -1:
                prevKey = line[2:].strip()  # malformed id: no closing '@'
            else:
                prevKey = line[2:closeAt + 1]
        elif isPrevKey:
            recType = prevLine[3:4]  # first character of the id
            if recType == "I" or recType == "S":
                # keep only the line that directly follows the record line
                gedDict[prevKey] = line[7:-1] + "\n"
                isPrevKey = False
            elif recType == "F":
                # isPrevKey stays True so every line of the family is checked.
                # NOTE: a later line with the same tag (e.g. a second CHIL)
                # replaces the earlier entry because the key is id + tag.
                famTag = line[0:6]
                if famTag in famList:
                    gedDict[prevKey + famTag] = line
    
print("end of GedExplore Analysis")
print()

# Count the gedDict keys that start with each prefix, and copy the first
# displayOnly entries of each kind into prefixList as "key,value".
prefixTotals = {"@I": 0, "@F": 0, "@S": 0}
for recPrefix in prefixTotals:
    shown = 0
    for dictKey, dictVal in gedDict.items():
        if dictKey[0:2] != recPrefix:
            continue
        prefixTotals[recPrefix] += 1
        if shown < displayOnly:
            shown += 1
            prefixList.append(dictKey + "," + dictVal.strip())

prefixIcnt = prefixTotals["@I"]
prefixFcnt = prefixTotals["@F"]
prefixScnt = prefixTotals["@S"]
            
len_gedDict = len(gedDict)

# Summary block: one "label value" line per metric, in a fixed order.
print()
print("GedExplore Summary:")
print("-------------------")
summaryRows = (
    ("gedFile:", gedFile),
    ("FILEname:", FILEname),
    ("record Count:", nRec),
    ("len_gedDict:", len_gedDict),
    ("Icnt:", prefixIcnt),
    ("Fcnt:", prefixFcnt),
    ("Scnt:", prefixScnt),
    ("errCnt:", errCnt),
    ("first UTF-8 err_recN:", firstUTF_recN),
)
for rowLabel, rowValue in summaryRows:
    print(rowLabel, rowValue)


# Optional detail dump: a bare Enter skips it, any other answer prints the
# collected header/index lines followed by the first nine read errors.
strIn = input("Hit Enter to suppress dump of Lists:")
if strIn:
    print()
    print("Lists: headers, indexes & errors:")
    print("---------------------------------")
    if prefixList:
        print("\n".join(prefixList))

    if errList:
        print("errList:")
        for errMsg in errList[:9]:
            print("strL:", errMsg)


# Release the input file, then print the closing banner after a blank line.
g.close()
closingMsg = "end of: " + progName
print()
print(closingMsg)