This is pretty old-school -- no databases, every filter reads in the file, then writes out to a file.
I've used functions passed as arguments to make it easy to create new filters.
I've included the four basic filters I've been using, and I've added an example of a more complex filter at the bottom, which hopefully demonstrates how to write most filters you might need -- but if there are any questions, just ask. The filter function (a lambda in these examples) should take a single argument -- a whole row from the detailed version of the database -- and return a truth value: true means the row will be included in the output file, false means it will be left out.
(I'm intending to integrate checking submitters' language skill at a later date, but this is what I've got at the moment.)
Code: Select all
# Tatoeba sentence filter
import csv
def genericFilter(filterFunc, inputFile, outputFile):
    """Copy the rows of a tab-separated file that satisfy a predicate.

    Every row of ``inputFile`` is parsed with the ``csv`` module (tab
    delimiter) and handed to ``filterFunc``; rows for which it returns a
    true value are written, in their original order, to ``outputFile``.

    Args:
        filterFunc: Callable taking one row (a list of column strings) and
            returning a truth value. True keeps the row, false drops it.
        inputFile: Path of the tab-separated file to read.
        outputFile: Path of the file to write. When ``None``, the output
            goes to ``inputFile + ".2.csv"``. Must not be the same path as
            ``inputFile``: the output is truncated as soon as it is opened.

    Raises:
        OSError: If either file cannot be opened.
    """
    if outputFile is None:
        outputFile = inputFile + ".2.csv"
    # The input is opened first so that a missing or unreadable input file
    # does not leave an empty (or freshly truncated) output file behind.
    # newline="" is what the csv module requires of its file objects; the
    # explicit encoding avoids depending on the platform's locale default
    # (assumes the data files are UTF-8 -- confirm against your export).
    with open(inputFile, "r", encoding="utf-8", newline="") as f_in, \
            open(outputFile, "w", encoding="utf-8", newline="") as f_out:
        csv_in = csv.reader(f_in, delimiter="\t")
        csv_out = csv.writer(f_out, delimiter="\t")
        csv_out.writerows(row for row in csv_in if filterFunc(row))
def filterByLanguage(lang, inputFile="sentences_detailed.csv", outputFile=None):
    """Keep only the rows whose column 1 equals ``lang``.

    Args:
        lang: Value compared (exact string equality) with column 1 of
            each row.
        inputFile: Path of the tab-separated file to read.
        outputFile: Path to write; ``None`` lets ``genericFilter`` pick its
            default output name.
    """
    # A plain ``lambda row:`` is used because the parenthesised parameter
    # form ``lambda (row):`` is a syntax error on Python 3.
    genericFilter(lambda row: row[1] == lang, inputFile, outputFile)
def filterByUser(user, inputFile="sentences_detailed.csv", outputFile=None):
    """Keep only the rows whose column 3 equals ``user``.

    Args:
        user: Value compared (exact string equality) with column 3 of
            each row.
        inputFile: Path of the tab-separated file to read.
        outputFile: Path to write; ``None`` lets ``genericFilter`` pick its
            default output name.
    """
    # A plain ``lambda row:`` is used because the parenthesised parameter
    # form ``lambda (row):`` is a syntax error on Python 3.
    genericFilter(lambda row: row[3] == user, inputFile, outputFile)
def filterHasAudio(inputFile="sentences_detailed.csv", outputFile=None,
                   audioFile="sentences_with_audio.csv"):
    """Keep only the rows whose column 0 also appears in the audio file.

    Column 0 of every row in ``audioFile`` is collected into a set, and a
    row of ``inputFile`` is kept when its own column 0 is in that set
    (presumably both columns hold the sentence id -- confirm against the
    data files).

    Args:
        inputFile: Path of the tab-separated file to filter.
        outputFile: Path to write; ``None`` lets ``genericFilter`` pick its
            default output name.
        audioFile: Path of the tab-separated file listing the entries that
            have audio.
    """
    # newline="" is what the csv module requires; the explicit encoding
    # avoids depending on the platform's locale default (assumes UTF-8).
    with open(audioFile, "r", encoding="utf-8", newline="") as f_audio:
        csv_audio = csv.reader(f_audio, delimiter="\t")
        # csv yields an empty list for a blank line; skip those instead of
        # failing on row[0].
        audio = {row[0] for row in csv_audio if row}
    # Set membership keeps the per-row test O(1) however large the audio
    # list is.
    genericFilter(lambda row: row[0] in audio, inputFile, outputFile)
def filterNoAudio(inputFile="sentences_detailed.csv", outputFile=None,
                  audioFile="sentences_with_audio.csv"):
    """Keep only the rows whose column 0 does NOT appear in the audio file.

    Column 0 of every row in ``audioFile`` is collected into a set, and a
    row of ``inputFile`` is kept when its own column 0 is absent from that
    set (presumably both columns hold the sentence id -- confirm against
    the data files).

    Args:
        inputFile: Path of the tab-separated file to filter.
        outputFile: Path to write; ``None`` lets ``genericFilter`` pick its
            default output name.
        audioFile: Path of the tab-separated file listing the entries that
            have audio.
    """
    # newline="" is what the csv module requires; the explicit encoding
    # avoids depending on the platform's locale default (assumes UTF-8).
    with open(audioFile, "r", encoding="utf-8", newline="") as f_audio:
        csv_audio = csv.reader(f_audio, delimiter="\t")
        # csv yields an empty list for a blank line; skip those instead of
        # failing on row[0].
        audio = {row[0] for row in csv_audio if row}
    # Set membership keeps the per-row test O(1) however large the audio
    # list is.
    genericFilter(lambda row: row[0] not in audio, inputFile, outputFile)
def filterByLanguageAndUser(lang, user, inputFile="sentences_detailed.csv",
                            outputFile=None):
    """Keep only the rows matching both a language and a user.

    Serves as an example of a compound filter: a row is kept when column 1
    equals ``lang`` and column 3 equals ``user``.

    Args:
        lang: Value compared (exact string equality) with column 1.
        user: Value compared (exact string equality) with column 3.
        inputFile: Path of the tab-separated file to read.
        outputFile: Path to write; ``None`` lets ``genericFilter`` pick its
            default output name.
    """
    # A plain ``lambda row:`` is used because the parenthesised parameter
    # form ``lambda (row):`` is a syntax error on Python 3.
    genericFilter(
        lambda row: row[1] == lang and row[3] == user,
        inputFile,
        outputFile,
    )