Source code for pype.binfmisc

from pype.misc import xopen


class fastq():
    """Fastq iterator to extract name, sequence and quality of each read."""

    def __init__(self, f, n=-1):
        """
        Specify the file object to iterate.

        :param f: fastq file; it is consumed with ``next()``, so it must be
            an iterator yielding one line at a time (an open text file is)
        :type f: File
        :param n: number of reads to evaluate, defaults to -1; the counter
            starts at 0 and only grows, so a negative value never matches
            and the whole file is read
        :type n: int, optional
        """
        self.fastq = f
        self.n = n
        # Number of complete records returned so far.
        self.counter = 0

    def __iter__(self):
        """Return the iterator itself, so instances work in ``for`` loops."""
        return self

    def __next__(self):
        """
        Read the next 4 lines of the fastq file.

        Implement a __next__ magic method to iterate the fastq file and
        return a dictionary with 'name', 'seq', 'sep' and 'qual' keys.
        The 'sep' line is collected for consistency, despite the fact
        that there may be no use for it. Every line is stripped of
        leading and trailing whitespace.

        :raises StopIteration: Maximum number of reads reached, or end of
            the file. A trailing record with fewer than 4 lines also ends
            the iteration and is discarded.
        :return: A dictionary with 'name', 'seq', 'sep' and 'qual' keys.
        :rtype: dict
        """
        if self.counter == self.n:
            raise StopIteration
        # next() raises StopIteration by itself once the underlying file is
        # exhausted; letting it propagate is what terminates the iteration,
        # so no try/except wrapper is needed around these reads.
        name = next(self.fastq).strip()
        read = next(self.fastq).strip()
        sep = next(self.fastq).strip()
        qual = next(self.fastq).strip()
        # Only complete 4-line records are counted against the limit.
        self.counter += 1
        return {
            'name': name,
            'seq': read,
            'sep': sep,
            'qual': qual}
def _strip_at(machine_id):
    """Drop the leading '@' of a fastq header field, if there is one."""
    if machine_id.startswith('@'):
        return machine_id[1:]
    return machine_id


def parse_fastq_name_illumina_1_8(line):
    """
    Parse a read name made of two space-separated, ':'-delimited groups.

    Field positions used, e.g. for
    ``@EAS139:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG``:
    first group 0 -> machine, 2 -> flowcell, 3 -> lane;
    second group 0 -> mate, 3 -> index.

    :param line: read name line of a fastq record
    :type line: str
    :raises IndexError: The name has no second group, or a group has
        fewer fields than required.
    :return: A dictionary with 'machine_id', 'flowcell_id', 'lane',
        'mate' and 'index' keys (all values are strings).
    :rtype: dict
    """
    groups = [item.split(':') for item in line.split(' ')]
    machine_id = _strip_at(groups[0][0])
    flowcell_id = groups[0][2]
    lane_nr = groups[0][3]
    mate_nr = groups[1][0]
    index_id = groups[1][3]
    return {
        'machine_id': machine_id,
        'flowcell_id': flowcell_id,
        'lane': lane_nr,
        'mate': mate_nr,
        'index': index_id
    }


def parse_fastq_name_illumina_1_4(line):
    """
    Parse a read name of the form ``<fields>#<index>/<mate>``.

    Field positions used, e.g. for ``@HWUSI-EAS100R:6:73:941:1973#0/1``:
    before the '#', field 0 -> machine and field 1 -> lane; after the
    '#', the text is split on '/' into index and mate. This layout has
    no flowcell field, so 'flowcell_id' is always None.

    :param line: read name line of a fastq record
    :type line: str
    :raises IndexError: The name has no '#' part or no ':' separator.
    :raises ValueError: The part after '#' does not contain exactly
        one '/'.
    :return: A dictionary with 'machine_id', 'flowcell_id', 'lane',
        'mate' and 'index' keys.
    :rtype: dict
    """
    groups = [item.split(':') for item in line.split('#')]
    machine_id = _strip_at(groups[0][0])
    lane_nr = groups[0][1]
    index_id, mate_nr = groups[1][0].split('/')
    return {
        'machine_id': machine_id,
        'flowcell_id': None,
        'lane': lane_nr,
        'mate': mate_nr,
        'index': index_id
    }


def parse_fastq_name_illumina_no_index(line):
    """
    Parse a ':'-delimited read name that carries no index.

    Field positions used, e.g. for ``@HWUSI-EAS100R:6:73:941:1973/1``:
    field 0 -> machine, field 1 -> lane, and the text after the '/' in
    field 4 -> mate. 'flowcell_id' and 'index' are always None; 'mate'
    is None when field 4 does not contain exactly one '/'.

    :param line: read name line of a fastq record
    :type line: str
    :raises IndexError: The name has fewer than 5 ':'-separated fields.
    :return: A dictionary with 'machine_id', 'flowcell_id', 'lane',
        'mate' and 'index' keys.
    :rtype: dict
    """
    fields = line.split(':')
    machine_id = _strip_at(fields[0])
    lane_nr = fields[1]
    try:
        # The text before the '/' is not an index in this layout, so it
        # is discarded.
        _, mate_nr = fields[4].split('/')
    except ValueError:
        mate_nr = None
    return {
        'machine_id': machine_id,
        'flowcell_id': None,
        'lane': lane_nr,
        'mate': mate_nr,
        'index': None
    }


def _parse_fastq_name(name):
    """Parse a read name, trying the '#' layout, then the two-group
    layout, then the index-less layout."""
    # ValueError is caught next to IndexError: the '#' parser raises it
    # for names that contain a '#' but no single '/' after it, and such
    # names must fall through to the next layout instead of aborting.
    for parser in (parse_fastq_name_illumina_1_4,
                   parse_fastq_name_illumina_1_8):
        try:
            return parser(name)
        except (IndexError, ValueError):
            continue
    # Last resort; its own errors propagate to the caller.
    return parse_fastq_name_illumina_no_index(name)


def fastq_name_info(fastq_file, n=50000, strict=False):
    """
    Extract machine, flowcell, lane, mate and index from fastq read names.

    The file is opened in text mode with ``xopen`` and up to ``n`` read
    names are parsed, each with the first layout that fits it.

    :param fastq_file: fastq file to inspect, handed to ``xopen``
    :param n: number of reads to evaluate, defaults to 50000
    :type n: int, optional
    :param strict: if True, require every evaluated read to give the same
        information as the first one, defaults to False
    :type strict: bool, optional
    :raises IndexError: No read could be collected from the file, or a
        read name fits none of the supported layouts.
    :raises Exception: ``strict`` is set and the evaluated reads disagree.
    :return: The information parsed from the first read, as a dictionary
        with 'machine_id', 'flowcell_id', 'lane', 'mate' and 'index' keys.
    :rtype: dict
    """
    with xopen(fastq_file, 'rt') as fq:
        reads_info = [
            _parse_fastq_name(item['name']) for item in fastq(fq, n)]
    if not reads_info:
        # Same exception type as the bare reads_info[0] lookup would
        # raise, but with a message that names the actual problem.
        raise IndexError('No reads found in fastq file')
    info = reads_info[0]
    if strict and any(item != info for item in reads_info):
        raise Exception('Unsuccessful fastq parsing')
    return info