#!/usr/pkg/bin/python """packgene.py -- IFF format for standardization of 2bit gene compression... eventual goal is to merge both competing formats into one... IFF format data from http://en.wikipedia.org/wiki/Interchange_File_Format """ Copyright = """ packgene -- create compressed gene data in standard IFF format Copyright (C) 2005 John Comeau This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. """ errormessage = "Not all needed libraries found, upgrade or check path: " try: True # not defined in older Python releases except: True, False = 1, 0 try: import sys, os, types, re, pwd sys.path.append(os.path.join(pwd.getpwuid(os.geteuid())[5], 'lib', 'python')) errormessage = errormessage + repr(sys.path) from com.jcomeau import gpl, jclicense except: try: sys.stderr.write("%s\n" % errormessage) except: print errormessage raise # get name this program was called as self = os.path.split(sys.argv[0])[1] command = os.path.splitext(self)[0] # chop any suffix (extension) # now get name we gave it when we wrote it originalself = re.compile('[0-9A-Za-z]+').search(Copyright).group() # globals and routines that should be in every program # (yes, you could import them, but there are problems in that approach too) def DebugPrint(*whatever): return False # defined instead by pytest module, use that for debugging def join(*args): "for pythons without str.join" string, array = args if type(array) == types.StringType: array = eval(array) if hasattr(str, 'join'): return string.join(array) else: joined = '' for index in range(0, len(array)): joined = joined + array[index] if index != (len(array) - 1): joined = joined + string return joined def split(*args): "for pythons without str.split" string, string_to_split = args if not len(string): string = None if hasattr('str', 'split'): return string_to_split.split(string) else: return re.compile(re.escape(string)).split(string_to_split) # other globals, specific to this program import struct LONGFORM = bool(os.getenv('LONGFORM')) or False LINELENGTH = 50 packstrings = [ 'GATCgatcNnMR', # the one I used originally in 2bit.c 'TCAGtcagNnMR', # similar but using the more widely-accepted order 'TCAGtcagNn-URYKM', # using codes from wikipedia.org FASTA_format entry 'UCAGucagNn-TRYKM', # with Uracil code for RNA # any other group of 16 characters can be used # make sure to put most-used codes in front (to left) ] filetype = [ # not the same as group type ID, this identifies the format '2BIT', # combined with VERS indicates the format specifics ] iff = [ # all true IFF files must begin with one of these 'CAT ', # collection of nested chunks with no special semantics 'FORM', # record structure followed by record fields 'LIST', # factoring structure followed by PROPerty chunks and nested groups # chunks used in many different IFF files 'AUTH', # author of file 'ANNO', # name of program that created the file 'BODY', # file data 'NAME', # name of work in file 'PROP', # property chunk 'VERS', # file version number '(c) ', # copyright data # specific to this application 'CODE', # the packstring, usually 12 or 16 bytes long 'PACK', # type of compression used 'SEQ ', # sequence data chunk 'SIZE', # size of uncompressed FASTA data ] longiff = [ # my proposed 64-bit IFF format # all long IFF files must begin with one of these 'LONGCAT ', # collection of nested chunks with no special semantics 'LONGFORM', # record structure followed by record fields 'LONGLIST', # followed by PROPerty chunks and nested groups # used in many different IFF files 'AUTHOR ', # author of file 'ANNOTATE', # name of program that created the file 'BODY ', # file data 'NAME ', # name of work in file 'PROPERTY', # property chunk 'VERSION ', # file version number '(c) ', # copyright data # specific to this application 'PACKCODE', # the packstring, usually 12 or 16 bytes long 'PACKTYPE', # type of compression used 'SEQUENCE', # sequence data chunk 'ORIGSIZE', # size of uncompressed FASTA data ] namesplit = re.compile('[\W]+') def die(*args): """issue error message and croak""" sys.stderr.write('%s\n' % args[0]) sys.exit(1) def packgene(*args): """pack FASTA file into 2bit IFF format file""" if LONGFORM: headers = dict(map(None, iff, longiff)) else: headers = dict(map(None, iff, iff)) DebugPrint(headers) if len(args) < 2: die('Usage: %s INFILE [...] OUTFILE' % self) files = list(args) outfile = files.pop() output = None if outfile == '-': output = sys.stdout elif not outfile.startswith('*.'): output = open(outfile, 'wb') for infile in files: name = '' input = open(infile) for line in input.readlines(): if line.startswith('>'): name = namesplit.split(line[1:])[0] DebugPrint(name, line, output) def packlength(*args): """pack number into big-endian 32-bit int or 64-bit long""" length = long(args[0]) if LONGFORM: format = '>Q' else: format = '>L' return struct.pack(format, length) def unpacklength(*args): """unpack length number from IFF file""" if LONGFORM: format = '>Q' else: format = '>L' return struct.unpack(format, args[0]) def iffscan(*args): """dump out (intelligently?) any IFF file""" for file in args: input = open(file, 'rb') state = 'fetch_groupid' while not input.closed: id = input.read(4) if state == 'fetch_groupid': if not id in ['FORM', 'LIST', 'CAT ']: sys.stderr.write('%s not an IFF file\n' % file) input.close() else: state = 'fetch_filelength' if __name__ == '__main__': function = command; args = sys.argv[1:] # default case if command == originalself: try: if len(args) and eval('type(%s) == types.FunctionType' % args[0]): function = sys.argv[1]; args = sys.argv[2:] except: pass print eval('%s%s' % (function, repr(tuple(args)))) or '' else: pass