Solexa Eland Export format to WIG
Here is a python script which converts from Eland export output to a wig file:
#!/n/site/inst/Linux-i686/sys/bin/python
#
#
import os, sys, random
# if no args print usage message and quit
if len(sys.argv) < 3:
print 'usage: program chrlist elandfilename'
sys.exit(0)
# create output file name
inputfile = sys.argv[2].rpartition('/')[2]
if inputfile.endswith('.txt'):
intuple = inputfile.rpartition('.')
outfile = intuple[0] + ".wig"
else:
outfile = inputfile + ".wig"
# get chromosome ids, UCSC names, sizes
f=open(sys.argv[1], 'r')
Chrs = []
chrnames = {}
chrsize = {}
for line in f:
# skip comment lines
if(line.startswith("#")):
continue
# remove newline, split by tab
(chr,size,fa) = line.rstrip('\n').split('\t')
Chrs.append(fa)
chrnames[fa] = chr
chrsize[fa] = size
f.close()
print "Chromosomes to search for:"
for c in Chrs[:]:
print c + " "
print chrnames[c]
# print len(Chrs)
# create filenames
# and open a bunch of files
ChrFnames = []
Fps = {}
i = 0
while i < len(Chrs):
newFileName = Chrs[i] + "." + str(random.randint(1,1000)) + ".txt"
Fps[Chrs[i]] = open(newFileName,'w')
ChrFnames.append(newFileName)
i = i + 1
# open input eland file
f=open(sys.argv[2], 'r')
# remember the begining position of file
fbegin = f.tell()
for c in Chrs[:]:
f.seek(fbegin)
# declare empty dictionary
bases = {}
print >>sys.stderr, "chromosome: " + c
nlines = 0
for line in f:
nlines += 1
L1 = line.split('\t')
filtercode = L1[21].rstrip('\n')
# skip reads that didn't pass filtering
if filtercode == "N":
continue
# 10 is chr, 12 is base, 13 is strand
if(L1[10] == c):
# filter out U reads with more than 2 mm
myset = ["A", "C", "G", "T", "N"]
ctotal = 0
for s in myset:
mycount = L1[14].count(s)
ctotal += mycount
if(ctotal > 2):
# too many mm, skip to next sequence
continue
start = int(L1[12])
end = start + 36
if(end > int(chrsize[c])):
end = int(chrsize[c])
for i in range(start, end):
if(i in bases):
bases[i] += 1
else:
bases[i] = 1
# give some feedback
if(not nlines % 10000):
print >>sys.stderr, nlines
# spit out results
print >>Fps[c], "variableStep chrom=" + chrnames[c]
# sort the dict and print it to file
for mybase in sorted(bases.keys()):
print >>Fps[c], "%s\t%d" % (mybase, bases[mybase])
# close file
Fps[c].close()
f.close()
# create an output file to concatenate all the results
# might want to test for presence of outfile, and if exists modify with random number
# newFileName = "output" + str(random.randint(1,1000)) + ".wig"
print >>sys.stderr, "creating output file: " + outfile
fhout = open(outfile, 'w')
# open each temp chr file
# write results to output file
# remove temp file
for fname in ChrFnames[:]:
f = open(fname,'r')
for line in f:
fhout.write(line)
f.close()
os.remove(fname)
fhout.close()
