rosalind练习题

1、统计各碱基的数量, 计算整条序列长度

seq = ''with open('./1.txt', "r") as f:for line in f:if line[0] != ">":#print(line)line = line.strip().upper()seq += line#统计碱基
print("A={0}".format(seq.count("A")))
print("T={0}".format(seq.count("T")))
print("G={0}".format(seq.count("G")))
print("C={0}".format(seq.count("C")))#统计长度
print("length={0}".format(len(seq)))

b='AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC'
print("A={0}".format(b.count("A")))
print("G={0}".format(b.count("G")))
print("C={0}".format(b.count("C")))
print("T={0}".format(b.count("T")))
print("length={0}".format(len(b)))

f = 'AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC'
counts = []
for i in ['A','G','C','T']: counts.append(f.count(i))
print (' '.join(map(str, counts)))
print ('seqlength =',len(f))

2、将DNA序列转换为RNA序列

import re
dna = 'GATGGAACTTGACTACGTAAATT'rnaseq1 = re.sub('T' , 'U', dna)
rnaseq2 = dna.replace('T', 'U')
print(rnaseq1)
print(rnaseq2)

import re
dnaseq = ''
g = open ('C:/Users/Administrator/Desktop/base1.txt', 'w')
f = open ('C:/Users/Administrator/Desktop/base.txt')
for line in f:if line[0] != '>':line = line.strip()dnaseq = dnaseq+linernaseq1 = re.sub('T' , 'U', dnaseq)
g.write(rnaseq1)
f.close()
g.close()

str = 'GATGGAACTTGACTACGTAAATT'
print(str.replace('T','U'))

3、打印出DNA互补链和反向互补链

def complement(seq):ntCom = {'A':'T', 'C':'G', 'T':'A', 'G':'C'}dna1= list(seq)dna2 = [ntCom[k] for k in dna1]dna3 = ''.join(dna2)return dna3seq = 'TGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCGTGGCCATCTACAAGCAGTCACAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGAT'print (complement(seq))

4、斐波那契兔子繁殖问题：
n个月后，每对兔子至少繁殖k对兔子，最后兔子总数是？

def fibonacciRabbits(n,k):F = [0,1,1]generation = 3while generation <= n:F.append(F[generation-1]+F[generation-2]*k)generation += 1return (F[n])fibonacciRabbits(5,3)

def fibonacciRabbits(n,k):if n <= 2:return (1)else:return (fibonacciRabbits(n-1,k) + fibonacciRabbits(n-2,k)*k)
print (fibonacciRabbits(5,3))

斐波那契数列

i, j = 1, 1
while i < 10000:print (i)i, j = j, i+j

# 递归
def Fibonacci_Recursion_tool(n):if n <= 0:return 0elif n == 1:return 1else:return Fibonacci_Recursion_tool(n - 1) + Fibonacci_Recursion_tool(n - 2)def Fibonacci_Recursion(n):result_list = []for i in range(1, n + 1): result_list.append(Fibonacci_Recursion_tool(i))return result_list

Fibonacci_Recursion(15)

[1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610]

5、计算GC含量最大的DNA序列

### 5. Computing GC Content ###
from operator import itemgetter
from collections import OrderedDictseqTest = OrderedDict()
gcContent = OrderedDict()with open('./5.txt', 'rt') as f:for line in f:  line = line.rstrip()if line.startswith('>'):seqName = line[1:]seqTest[seqName] = ''continueseqTest[seqName] += line.upper()for ke, val in seqTest.items():totalLength = len(val)gcNumber = val.count('G') + val.count('C')gcContent[ke] = float(gcNumber/totalLength)*100sortedGCContent = sorted(gcContent.items(), key = itemgetter(1))largeName = sortedGCContent[-1][0]
largeGCContent = sortedGCContent[-1][1]print ('most GC ratio gene is %s and it is %s ' %(largeName,largeGCContent))

# to open FASTA format sequence file:
s=open('./5.txt','r').readlines()# to create two lists, one for names, one for sequences
name_list=[]
seq_list=[]data='' # to put the sequence from several lines togetherfor line in s:line=line.strip()for i in line:if i == '>':name_list.append(line[1:])if data:seq_list.append(data)         #将每一行的的核苷酸字符串连接起来data=''                       # 合完后data 清零breakelse:line=line.upper()if all([k==k.upper() for k in line]):    #验证是不是所有的都是大写data=data+line
seq_list.append(data)                         # is there a way to include the last sequence in the for loop?
GC_list=[]
for seq in seq_list:i=0for k in seq:if k=="G" or k=='C':i+=1GC_cont=float(i)/len(seq)*100.0GC_list.append(GC_cont)m=max(GC_list)
print (name_list[GC_list.index(m)])             # to find the index of max GC
print ("{:0.6f}".format(m))                  # 保留6位小数

def parse_fasta(s):results = {}strings = s.strip().split('>')# Python split()通过指定分隔符对字符串进行切片，如果参数num 有指定值，则仅分隔 num 个子字符串for s in strings:if len(s) == 0:continue# 如果字符串长度为0，就跳出循环。parts = s.split()label = parts[0]bases = ''.join(parts[1:])results[label] = basesreturn resultsdef gc_content(s):n = len(s)m = 0for c in s:if c == 'G' or c == 'C':m += 1return 100 * (float(m) / n)if __name__ == "__main__":small_dataset = """>Rosalind_6404CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG>Rosalind_5959CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC>Rosalind_0808CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGACTGGGAACCTGCGGGCAGTAGGTGGAAT"""#large_dataset = open('datasets/rosalind_gc.txt').read()results = parse_fasta(small_dataset)results = dict([(k, gc_content(v)) for k, v in results.items()])# 这里iteritem()和item()功能是一样的# 前一个results输出，名称+序列，后一个results输出，名称+百分比highest_k = Nonehighest_v = 0for k, v in results.items():if v > highest_v:highest_k = khighest_v = v# 输出GC含量高的print (highest_k)print ('%f%%' % highest_v)

6、计算点突变个数

with open('./6.txt', 'rt') as f:seq = f.readlines()
seq1,seq2 = seq[0].strip(),seq[1].strip()
HD = 0for i in range(len(seq1)):if seq1[i] != seq2[i]:HD += 1print (HD)

s = 'GAGCCTACTAACGGGAT'
t = 'CATCGTAATGACGGCCT'
count = 0
for i in range(len(s)):if s[i] != t[i]:count +=1
print (count)

7、孟德尔第一定律

from scipy.misc import combindividuals = input('Number of individuals(k,m,n):')
[k, m, n] = map(int,individuals.split(','))
t = k+m+nrr = comb(n,2)/comb(t,2)
hh = comb(m,2)/comb(t,2)
hr = comb(n,1)*comb(m,1)/comb(t,2) prob = 1 - (rr+hh*1/4+hr*1/2)
print(rr, hh,hr)
print (prob)

def f(x, y, z):s = x + y + z  # the sum of populationc = s * (s - 1) / 2.0  # comb(2,s)p = 1 - (z * (z - 1) / 2 + 0.25 * y * (y - 1) / 2 + y * z * 0.5) / creturn pprint (f(2, 2, 2))

8、RNA序列翻译为蛋白质

def translate_rna(sequence):codonTable = {'AUA':'I', 'AUC':'I', 'AUU':'I', 'AUG':'M','ACA':'T', 'ACC':'T', 'ACG':'T', 'ACU':'T','AAC':'N', 'AAU':'N', 'AAA':'K', 'AAG':'K','AGC':'S', 'AGU':'S', 'AGA':'R', 'AGG':'R','CUA':'L', 'CUC':'L', 'CUG':'L', 'CUU':'L','CCA':'P', 'CCC':'P', 'CCG':'P', 'CCU':'P','CAC':'H', 'CAU':'H', 'CAA':'Q', 'CAG':'Q','CGA':'R', 'CGC':'R', 'CGG':'R', 'CGU':'R','GUA':'V', 'GUC':'V', 'GUG':'V', 'GUU':'V','GCA':'A', 'GCC':'A', 'GCG':'A', 'GCU':'A','GAC':'D', 'GAU':'D', 'GAA':'E', 'GAG':'E','GGA':'G', 'GGC':'G', 'GGG':'G', 'GGU':'G','UCA':'S', 'UCC':'S', 'UCG':'S', 'UCU':'S','UUC':'F', 'UUU':'F', 'UUA':'L', 'UUG':'L','UAC':'Y', 'UAU':'Y', 'UAA':'', 'UAG':'','UGC':'C', 'UGU':'C', 'UGA':'', 'UGG':'W',}proteinsequence = ''for n in range(0,len(sequence),3):proteinsequence += codonTable[sequence[n:n+3]]return proteinsequencese = 'AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA' #sequence or se = open('rosalind_prot.txt').read().strip('\n') #sequence
print(translate_rna(se))

import re
def mRNA_protein(RNA_string):start_code = 'AUG'end_code = ['UAA', 'UAG', 'UGA']protein_table = {'UUU': 'F', 'CUU': 'L', 'AUU': 'I', 'GUU': 'V', \'UUC': 'F', 'CUC': 'L', 'AUC': 'I', 'GUC': 'V', \'UUA': 'L', 'CUA': 'L', 'AUA': 'I', 'GUA': 'V', \'UUG': 'L', 'CUG': 'L', 'AUG': 'M', 'GUG': 'V', \'UCU': 'S', 'CCU': 'P', 'ACU': 'T', 'GCU': 'A', \'UCC': 'S', 'CCC': 'P', 'ACC': 'T', 'GCC': 'A', \'UCA': 'S', 'CCA': 'P', 'ACA': 'T', 'GCA': 'A', \'UCG': 'S', 'CCG': 'P', 'ACG': 'T', 'GCG': 'A', \'UAU': 'Y', 'CAU': 'H', 'AAU': 'N', 'GAU': 'D', \'UAC': 'Y', 'CAC': 'H', 'AAC': 'N', 'GAC': 'D', \'UAA': 'Stop', 'CAA': 'Q', 'AAA': 'K', 'GAA': 'E', \'UAG': 'Stop', 'CAG': 'Q', 'AAG': 'K', 'GAG': 'E', \'UGU': 'C', 'CGU': 'R', 'AGU': 'S', 'GGU': 'G', \'UGC': 'C', 'CGC': 'R', 'AGC': 'S', 'GGC': 'G', \'UGA': 'Stop', 'CGA': 'R', 'AGA': 'R', 'GGA': 'G', \'UGG': 'W', 'CGG': 'R', 'AGG': 'R', 'GGG': 'G'}#找到起始密码子的位置start_sit = re.search(start_code, RNA_string)protein = ''#按阅读框匹配蛋白质for sit in range(start_sit.start(), len(RNA_string), 3):protein = protein + protein_table[RNA_string[sit:sit+3]]return proteinif __name__ == '__main__':RNA_string =  'ACGGGGAUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA'print(mRNA_protein(RNA_string))

import re
from collections import OrderedDictcodonTable = OrderedDict()
with open('rna_codon_table.txt') as f:for line in f:line = line.rstrip()lst = re.split('\s+', line)      #\s+ 匹配空格1次或无限次for i in [0, 2, 4, 6]:codonTable[lst[i]] = lst[i + 1]
rnaSeq = ''
with open('rosalind_prot.txt', 'rt') as f:for line in f:line = line.rstrip()rnaSeq += line.upper()
aminoAcids = []
i = 0
while i < len(rnaSeq):codon = rnaSeq[i:i + 3]if codonTable[codon] != 'Stop':aminoAcids.append(codonTable[codon])i += 3peptide = ''.join(aminoAcids)
print (peptide)

9、寻找DNA Motif

import re
matches = re.finditer('(?=ATAT)', 'GATATATGCATATACTT')
for match in matches:print (match.start()+1,end = '\t')

seq = 'GATATATGCATATACTT'
pattern = 'ATAT'def find_motif(seq, pattern):position = []for i in range(len(seq) - len(pattern)):if seq[i:i + len(pattern)] == pattern:position.append(str(i + 1))print ('\t'.join(position))find_motif(seq, pattern)

import re
seq='GATATATGCATATACTT'
print ([i.start()+1 for i in re.finditer('(?=ATAT)',seq)])

10、寻找共同祖先

import numpy as np
import os
from collections import Counter
fhand = open('./10.txt')
t = []
for line in fhand:if line.startswith('>'):continueelse:line = line.rstrip()line_list = list(line)t.append(line_list)
a = np.array(t)#创建一个二维数组
print(a)
L1,L2,L3,L4 = [], [], [], []
comsquence=''
for i in range(a.shape[1]):l = [x[i] for x in a] #调出二维数组的每一列L1.append(l.count('A'))L2.append(l.count('C'))L3.append(l.count('T'))L4.append(l.count('G'))comsquence=comsquence+Counter(l).most_common()[0][0]
print (comsquence)
print ('A:',L1,'\n','C:',L2,'\n','T:',L3,'\n','G:',L4)

from collections import Counter
from collections import OrderedDictfh = open('./11.txt', 'wt')rosalind = OrderedDict()
seqLength = 0
with open('./10.txt') as f:for line in f:line = line.rstrip()if line.startswith('>'):rosalindName = linerosalind[rosalindName] = ''continuerosalind[rosalindName] += lineseqLength = len(rosalind[rosalindName])   #len(ATCCAGCT)
A, C, G, T = [], [], [], []
consensusSeq = ''
for i in range(seqLength):seq = ''for k in rosalind.keys():      # rosalind.keys = Rosalind_1...Rosalind_7seq += rosalind[k][i]       # 把 Rosalind_1...Rosalind_7 相同顺序的碱基加起来A.append(seq.count('A'))C.append(seq.count('C'))G.append(seq.count('G'))T.append(seq.count('T'))counts = Counter(seq)           # 为seq计数consensusSeq += counts.most_common()[0][0] print(consensusSeq)#从多到少返回,是个list啊，只返回第一个
fh.write(consensusSeq + '\n')
fh.write('\n'.join(['A:\t' + '\t'.join(map(str, A)), 'C:\t' + '\t'.join(map(str, C)),'G:\t' + '\t'.join(map(str, G)), 'T:\t' + '\t'.join(map(str, T))]))#.join(map(str,A))  把 A=[5, 1, 0, 0, 5, 5, 0, 0] 格式转化成 A：5 1 0 0 5 5 0 0
fh.close()

12、重叠序列

from collections import OrderedDict
import redef overlap_graph(dna,n):edges = []for key1, val1 in dna:for key2, val2 in dna:if key1 != key2 and val1[-n:] == val2[0:n]:edges.append(key1 +'\t' +key2)return edgesseq = OrderedDict()with open ('./12.txt', 'r') as f:for line in f:line = line.rstrip()if line.startswith('>'):seqName = re.sub('>','',line)seq[seqName] = ''continueseq[seqName] += line.upper()fh = open('./12.1.txt', 'wt')for x in overlap_graph(dna,3):fh.write(x + '\n')
fh.close()

seq_list = []
stseq = ''
for line in open('./12.txt','r'):if line[0] == '>':if stseq != '':seq_list.append([stname, stseq])stseq = ''stname = line[1:-1]else:stseq = stseq + line.strip('\n')
seq_list.append([stname, stseq])
l = len(seq_list)for i in range(0, l):for j in range(0, i):if seq_list[i][1] == seq_list[j][1]:continueif seq_list[i][1][0:3] == seq_list[j][1][-3:]:print (seq_list[j][0], seq_list[i][0])if seq_list[i][1][-3:] == seq_list[j][1][0:3]:print (seq_list[i][0], seq_list[j][0])

13、计算后代的概率

a = int(input('AA-AA:'))
b = int(input('AA-Aa:'))
c = int(input('AA-aa:'))
d = int(input('Aa-Aa:'))
e = int(input('Aa-aa:'))
f = int(input('aa-aa:'))def fun(a,b,c,d,e,f):x1 = 1*ax2 = 1*bx3 = 1*cx4 = 0.75*dx5 = 0.5*ex6 = 0*freturn sum([x1,x2,x3,x4,x5,x6])*2print(fun(a,b,c,d,e,f))

14、发现共享Motif

s = """>Rosalind_1
GATTACA
>Rosalind_2
TAGACCA
>Rosalind_3
ATACA""".split(">")[1:]for i in range(len(s)):s[i] = s[i].replace("\n", '')while s[i][0] not in "ACGT":s[i] = s[i][1:]
# ^^^^^^^^^^^^^ all of that to format in FAST in array#Get shortest of DNA strings
index = s.index(min(s, key=len))motif = 'A'
shortest = s[index]#cycle over the DNA string letters
for i in range(len(shortest)):n = 0present = Truewhile present:#cycle inside over all other DNA strings and if it's present in there considered a motif and length gets increased by 1for each in s:if shortest[i:i+n] not in each or n>1000:present = Falsebreakif present:motif = max(shortest[i:i+n], motif, key=len)n += 1
print (motif)

def readfasta(filename, sample):fa = open('./14.txt', 'r')fo = open('./14.1.txt', 'w')res = {}rres = []ID = ''for line in fa:if line.startswith('>'):ID = line.strip('\n')res[ID] = ''else:res[ID] += line.strip('\n')for key in res.values():rres.append(key)fo.write(key + '\n')return rresdef fragement(seq_list):res = []seq = seq_list[0]for i in range(len(seq)):s_seq = seq[i:]#print s_seqfor j in range(len(s_seq)):res.append(s_seq[:(len(s_seq) - j)])#print resreturn resdef main(infile, sample):seq_list = readfasta(infile, sample)   #['TAGACCA','ATACA','GATTACA']print(seq_list)frags = fragement(seq_list)frags.sort(key=len, reverse=True)     # 从长到短排列for i in range(len(frags)):ans = []# s = 0# m+=1# print(m)# res[frags[i]] = 0for j in seq_list:r = j.count(frags[i])if r != 0:ans.append(r)if len(ans) >= len(seq_list):print(frags[i])breakmain('14.txt', 'sample.txt')

15、Independent Alleles

import itertools
def f(k,n):p = []child_num = 2**kfor i in range(n):p.append(len(list(itertools.combinations([x for x in range(child_num)],i)))*(0.25**i)*(0.75**(child_num-i)))# combinations('ABCD', 2)       AB AC AD BC BD CDreturn 1-sum(p)print (f(5,8))

16、Finding a Protein Motif

import urllib.request
import re
list = ['A2Z669','B5ZC00','P07204_TRBM_HUMAN','P20840_SAG1_YEAST']for one in list:name = one.strip('\n')url = 'http://www.uniprot.org/uniprot/'+name+'.fasta'req = urllib.request.Request(url)response = urllib.request.urlopen(req)   #在python3.3里面，用urllib.request代替urllib2the_page = response.read()start = the_page.decode().find('\nM')    #str→bytes：encode()方法。str通过encode()方法可以转换为bytes。# bytes→str：decode()方法。如果我们从网络或磁盘上读取了字节流，那么读到的数据就是bytes。#要把bytes变为str，就需要用decode()方法。seq = the_page[start+1:].decode().replace('\n','')seq = ' '+seqregex = re.compile(r'N(?=[^P][ST][^P])')index = 0out = []'''out = [m.start() for m in re.finditer(regex, seq)]'''index = 0while(index<len(seq)):index += 1if re.search(regex,seq[index:]) == None:break#print S[index:]if re.match(regex,seq[index:]) != None:out.append(index)if out != []:print (name)print (' '.join([ str(i) for i in out]))

rosalind练习题相关推荐

2016计算机二级java_2016计算机二级JAVA练习题及答案
2016计算机二级JAVA练习题及答案 21.下列选项中,不能输出100个整数的.是( ). A.for(int i=0;i<100;i++) System.out.println(i); B. ...
C语言语句单选题,C语言练习题
C语言练习题 C一个switch语句总是可以被一系列ifelse语句替换 D switch语句的测试表达式可以是任何类型 E当执行break语句时程序将停止执行 20在C语言中,在int num[5] ...
word2003计算机应用考试,2017职称计算机考试Word2003操作练习题
2017职称计算机考试Word2003操作练习题实验操作能力是计算机考试考查的一项基本能力,下面是小编给大家提供的职称计算机考试Word2003操作练习题,大家可以参考练习,更多习题练习请关注应届毕 ...
Python-100 练习题 01 列表推导式
最近打算好好练习下 python,因此找到一个练习题网站,打算每周练习 3-5 题吧. www.runoob.com/python/pyth- 另外,这个网站其实也还有 Python 的教程,从基础到 ...
MySQL练习题：常用函数
1. 以首字母大写,其他字母小写的方式显示所有员工的姓名 2. 将员工的职位用小写字母显示 3. 显示员工姓名超过5个字符的员工名 4. 用#来填充员工职位job的结尾处,按10个字符长度输出. 5. ...
python基础练习题
[练习题1]实现一个整数加法计算器如 content = input(">>> ") # 5+9 , 6+4 count=0 while 1:content=in ...
MySql练习题参考答案
表结构: /*Navicat Premium Data TransferSource Server : localhostSource Server Type : MySQLSource Server ...
第六章练习题和知识面扩充
作业题: 1. 自动获取IP地址的命令是什么?您知道在什么情况下,您的Linux才可以自动获取IP地址? 2. 远程连接Linux服务器,需要Linux服务器开启sshd服务,那么sshd服务默认监听 ...
暑期集训5:并查集线段树练习题G: HDU - 1754
2018学校暑期集训第五天--并查集线段树练习题G -- HDU - 1754 I Hate It 很多学校流行一种比较的习惯.老师们很喜欢询问,从某某到某某当中,分数最高的是多少. 这让 ...

rosalind练习题

rosalind练习题相关推荐

最新文章

热门文章