I'm trying to go through each feature in one file (one per line) and, for each, find all matching features in a second file based on one column of that line. I have a solution that does what I want on small files, but it's very slow on big files (my files have >20,000,000 lines). Here's a sample of the two input files.
My (slow) code:
FEATUREFILE = 'S2_STARRseq_rep1_vsControl_peaks.bed'
CONSERVATIONFILEDIR = './conservation/'

with open(FEATUREFILE, 'r') as peakFile, open('featureConservation.td', 'w+') as outfile:
    for line in peakFile:
        # Split each peak line once rather than re-splitting for every field
        fields = line.split('\t')
        chrom = fields[0]
        startPos = int(fields[1])
        endPos = int(fields[2])
        peakName = fields[3]
        enrichVal = float(fields[4])
        # Reject negative peak starts, if they exist (sometimes this can happen w/ MACS)
        if startPos > 0:
            with open(CONSERVATIONFILEDIR + chrom + '.bed', 'r') as conservationFile:
                cumulConserv = 0.
                n = 0
                for conservLine in conservationFile:
                    conservFields = conservLine.split('\t')
                    position = int(conservFields[1])
                    if startPos <= position <= endPos:
                        cumulConserv += float(conservFields[3])
                        n += 1
            # Guard against peaks with no overlapping conservation positions
            featureConservation = cumulConserv / n if n else 0.
            outfile.write('\t'.join([chrom, str(startPos), str(endPos), peakName,
                                     str(enrichVal), str(featureConservation)]) + '\n')
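One direction I've sketched (untested on the full data, and it assumes each per-chromosome conservation .bed file is sorted by position in column 2) is to group the peaks by chromosome, load each conservation file once into position-sorted lists, and use the standard-library bisect module to find the overlapping positions by binary search instead of rescanning the whole file for every peak. The helper name load_conservation is just illustrative:

import bisect
from collections import defaultdict

FEATUREFILE = 'S2_STARRseq_rep1_vsControl_peaks.bed'
CONSERVATIONFILEDIR = './conservation/'

def load_conservation(chrom):
    # Read one chromosome's conservation file into parallel lists
    # (positions ascending) so peaks can be looked up by binary search.
    positions, scores = [], []
    with open(CONSERVATIONFILEDIR + chrom + '.bed', 'r') as f:
        for line in f:
            fields = line.split('\t')
            positions.append(int(fields[1]))
            scores.append(float(fields[3]))
    return positions, scores

# Group peaks by chromosome so each conservation file is opened only once
peaks_by_chrom = defaultdict(list)
with open(FEATUREFILE, 'r') as peakFile:
    for line in peakFile:
        fields = line.rstrip('\n').split('\t')
        if int(fields[1]) > 0:  # same negative-start filter as above
            peaks_by_chrom[fields[0]].append(fields)

with open('featureConservation.td', 'w+') as outfile:
    for chrom, peaks in peaks_by_chrom.items():
        positions, scores = load_conservation(chrom)
        for fields in peaks:
            startPos, endPos = int(fields[1]), int(fields[2])
            # First index with position >= startPos, first with position > endPos
            lo = bisect.bisect_left(positions, startPos)
            hi = bisect.bisect_right(positions, endPos)
            n = hi - lo
            featureConservation = sum(scores[lo:hi]) / n if n else 0.
            outfile.write('\t'.join(fields[:5] + [str(featureConservation)]) + '\n')

This reads each conservation file only once and replaces the linear scan per peak with two binary searches, though it does hold one chromosome's positions and scores in memory at a time. I'd still welcome a better approach if there's a standard tool for this.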