forked from justinbois/bootcamp
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexample2_2.py
89 lines (69 loc) · 2.64 KB
/
example2_2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#Exercise 2.2
import os
with open('data/salmonella_spi1_region.fna','r') as f:
counter = 0
base_pairs = 'CcGgAaTtUu'
Long_chain = ''
for line in f:
counter += 1
if line[0] in base_pairs:
Long_chain += line.rstrip()
#Example 2.3
def gc_blocks(seq, block_size):
if type(block_size) != int or block_size <= 0:
RuntimeError('Block size not a positive integer.')
#Defines blocks of the sequence.
seq = seq.upper()
number_of_blocks = len(seq) // block_size
new_seq_length = number_of_blocks * block_size
shortened_seq = seq[0:new_seq_length]
#Calculate GC content for different blocks
gc_of_blocks = []
for i in range(0,new_seq_length,block_size):
gc_current_block = (shortened_seq[i:i+block_size].count('G')\
+ shortened_seq[i:i+block_size].count('C')) / block_size
gc_of_blocks += [gc_current_block]
gc_of_blocks = tuple(gc_of_blocks)
return gc_of_blocks
#Example 2.3b
def gc_map(seq, block_size, gc_thresh):
"""Checks if a block of sequence is above gc threshold."""
if type(block_size) != int or block_size <= 0:
RuntimeError('Block size not a positive integer.')
#Defines blocks of the sequence.
seq = seq.lower()
number_of_blocks = len(seq) // block_size
new_seq_length = number_of_blocks * block_size
shortened_seq = seq[0:new_seq_length]
#Calculate GC content for different blocks
threshold_sequence = ''
for i in range(0, new_seq_length, block_size):
gc_current_block = (shortened_seq[i:i+block_size].count('g')\
+ shortened_seq[i:i+block_size].count('c')) / block_size
print(gc_current_block)
if gc_current_block > gc_thresh:
threshold_sequence += shortened_seq[i:i+block_size].upper()
else:
threshold_sequence += shortened_seq[i:i+block_size]
return threshold_sequence
#Example 2.4a
def longest_orf(seq):
"""Finds longest open reading frame in DNA sequence."""
start_codons = 'ATG'
stop_codons = ['TGA','TAG','TAA']
#Look for start codon
old_orf_length = 0
for i in range(0,len(seq)):
if seq[i:i+3] in start_codons:
#search for corresponding stop codon.
for j in range(i+3,len(seq),3):
if seq[j:j+3] in stop_codons:
orf_length = j + 3 - i
#Check to see if new orf is longer than old.
if orf_length > old_orf_length:
old_orf_length = orf_length
orf_longest = seq[i:j+3]
return orf_longest
#Example 2.4c
import bioinfo_dicts.py
def dna_to_protein(seq):