5759 |
26 Sep 11 |
nicklas |
1 |
/** |
5759 |
26 Sep 11 |
nicklas |
$Id$ |
5759 |
26 Sep 11 |
nicklas |
3 |
|
5759 |
26 Sep 11 |
nicklas |
Copyright (C) 2011 Nicklas Nordborg |
5759 |
26 Sep 11 |
nicklas |
5 |
|
5759 |
26 Sep 11 |
nicklas |
This file is part of BASE - BioArray Software Environment. |
5759 |
26 Sep 11 |
nicklas |
Available at http://base.thep.lu.se/ |
5759 |
26 Sep 11 |
nicklas |
8 |
|
5759 |
26 Sep 11 |
nicklas |
BASE is free software; you can redistribute it and/or |
5759 |
26 Sep 11 |
nicklas |
modify it under the terms of the GNU General Public License |
5759 |
26 Sep 11 |
nicklas |
as published by the Free Software Foundation; either version 3 |
5759 |
26 Sep 11 |
nicklas |
of the License, or (at your option) any later version. |
5759 |
26 Sep 11 |
nicklas |
13 |
|
5759 |
26 Sep 11 |
nicklas |
BASE is distributed in the hope that it will be useful, |
5759 |
26 Sep 11 |
nicklas |
but WITHOUT ANY WARRANTY; without even the implied warranty of |
5759 |
26 Sep 11 |
nicklas |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
5759 |
26 Sep 11 |
nicklas |
GNU General Public License for more details. |
5759 |
26 Sep 11 |
nicklas |
18 |
|
5759 |
26 Sep 11 |
nicklas |
You should have received a copy of the GNU General Public License |
5759 |
26 Sep 11 |
nicklas |
along with BASE. If not, see <http://www.gnu.org/licenses/>. |
5759 |
26 Sep 11 |
nicklas |
21 |
*/ |
5759 |
26 Sep 11 |
nicklas |
22 |
package net.sf.basedb.util.gtf; |
5759 |
26 Sep 11 |
nicklas |
23 |
|
5759 |
26 Sep 11 |
nicklas |
24 |
import java.io.BufferedReader; |
5759 |
26 Sep 11 |
nicklas |
25 |
import java.io.IOException; |
5759 |
26 Sep 11 |
nicklas |
26 |
import java.io.InputStream; |
5759 |
26 Sep 11 |
nicklas |
27 |
import java.io.InputStreamReader; |
5759 |
26 Sep 11 |
nicklas |
28 |
import java.nio.charset.Charset; |
5759 |
26 Sep 11 |
nicklas |
29 |
import java.util.ArrayList; |
5759 |
26 Sep 11 |
nicklas |
30 |
import java.util.HashSet; |
5759 |
26 Sep 11 |
nicklas |
31 |
import java.util.List; |
5759 |
26 Sep 11 |
nicklas |
32 |
import java.util.Set; |
5759 |
26 Sep 11 |
nicklas |
33 |
import java.util.regex.Matcher; |
5759 |
26 Sep 11 |
nicklas |
34 |
import java.util.regex.Pattern; |
5759 |
26 Sep 11 |
nicklas |
35 |
|
5759 |
26 Sep 11 |
nicklas |
36 |
import net.sf.basedb.util.parser.FlatFileParser; |
5759 |
26 Sep 11 |
nicklas |
37 |
|
5759 |
26 Sep 11 |
nicklas |
38 |
/** |
5759 |
26 Sep 11 |
nicklas |
Input stream implementation that reads from a GTF file and converts it to |
5759 |
26 Sep 11 |
nicklas |
a simple tab-separated file with a single line of column headers. This is |
5759 |
26 Sep 11 |
nicklas |
useful since it means that we can use the regular {@link FlatFileParser} |
5759 |
26 Sep 11 |
nicklas |
and other tools for parsing the resulting stream. The first line in the |
5759 |
26 Sep 11 |
nicklas |
file is used as a template line. The first 8 columns are fixed. The 9th column |
5759 |
26 Sep 11 |
nicklas |
contains attributes as key/value pairs, which are converted to additional |
5759 |
26 Sep 11 |
nicklas |
columns in the output. The GTF specification requires that <code>gene_id</code> |
5759 |
26 Sep 11 |
nicklas |
and <code>transcript_id</code> are present, which means that the output will |
5759 |
26 Sep 11 |
nicklas |
contain at least 10 columns. Subsequent lines are parsed in the same way and |
5759 |
26 Sep 11 |
nicklas |
attributes are lined up with the first line. Note that any attributes |
5759 |
26 Sep 11 |
nicklas |
that are not present in the first line are skipped. The parser also has an |
5770 |
29 Sep 11 |
nicklas |
option to skip lines with a <code>transcript_id+seqname</code> that is not unique. |
5759 |
26 Sep 11 |
nicklas |
Normally, a GTF file will contain multiple entries with the same id:s, but |
5759 |
26 Sep 11 |
nicklas |
in most cases we are not interested in this when importing data to BASE. |
5759 |
26 Sep 11 |
nicklas |
This option also removes the feature, start, end, score, strand and frame |
6459 |
27 May 14 |
nicklas |
columns from the output. Lines that can't be split into at least 9 columns |
6459 |
27 May 14 |
nicklas |
(eg. comment lines starting with #) are ignored and forwarded without modification. |
5759 |
26 Sep 11 |
nicklas |
56 |
|
5759 |
26 Sep 11 |
nicklas |
@author Nicklas |
5759 |
26 Sep 11 |
nicklas |
@since 3.0 |
5759 |
26 Sep 11 |
nicklas |
@base.modified $Date$ |
5759 |
26 Sep 11 |
nicklas |
60 |
*/ |
5759 |
26 Sep 11 |
nicklas |
61 |
public class GtfInputStream
	extends InputStream
{

	/**
		Matches one <code>key value;</code> pair in the attributes column.
		Group 1 is the key. Group 2 is the value, which is either an unquoted
		token or a double-quoted string (the quotes are part of the captured value).
	*/
	private static final Pattern ATTRIBUTE_PATTERN =
		Pattern.compile("([^ ]+) (([^\";]*)|(\"[^\"]*\"));");

	private final InputStream master;
	private final BufferedReader reader;
	private final Charset charset;

	// Converted bytes for the most recently read line(s) from the GTF file;
	// null once the end of the file has been reached
	private byte[] buffer;
	// The index of the next byte to return from the buffer
	private int index;

	// Number of lines that we have read from the GTF file
	private int lineNum;

	// List of attributes in the GTF file in the order they appear
	// on the first line. gene_id and transcript_id are required
	// so we remember the index they have
	private Attribute[] attributes;
	private int geneIdIndex;
	private int transcriptIdIndex;

	// For storing id:s that we have seen and may be skipped
	private final boolean skipRepeatedTranscriptIds;
	private final Set<String> transcriptIds;

	/**
		Create a new input stream reading from the master. The first
		line of the master stream is read and converted immediately.
		@param master The master input stream
		@param charset The character set used in the file, or null
			to use ISO-8859-1
		@param skipRepeatedTranscriptIds TRUE to skip lines with non-unique
			values for transcript_id+seqname
		@throws IOException If the first line can't be read or if it is a
			data line that is missing a required attribute
	*/
	public GtfInputStream(InputStream master, String charset, boolean skipRepeatedTranscriptIds)
		throws IOException
	{
		this.master = master;
		this.charset = Charset.forName(charset == null ? "ISO-8859-1" : charset);
		this.skipRepeatedTranscriptIds = skipRepeatedTranscriptIds;
		this.transcriptIds = new HashSet<String>();
		this.reader = new BufferedReader(new InputStreamReader(master, this.charset));
		this.buffer = readMore();
	}

	/*
		From the InputStream class
		--------------------------
	*/
	/**
		Read the next converted byte.
		@return The byte as a value between 0 and 255, or -1 at the end of the file
	*/
	@Override
	public int read()
		throws IOException
	{
		if (!fillBuffer()) return -1;
		// Important! Mask with 0xff to get values in range (0,255), NOT (-128,127)
		// The index must be advanced or the same byte is returned forever
		return buffer[index++] & 0xff;
	}

	@Override
	public int read(byte[] b)
		throws IOException
	{
		return read(b, 0, b.length);
	}

	/**
		Copy converted bytes to the given array. At most the remainder of
		the currently buffered line(s) is copied in a single call.
		@return The number of bytes copied, 0 if len is 0, or -1 at the end of the file
	*/
	@Override
	public int read(byte[] b, int off, int len)
		throws IOException
	{
		// The InputStream contract says that a zero-length read returns 0
		if (len == 0) return 0;
		if (!fillBuffer()) return -1;
		int count = Math.min(len, buffer.length - index);
		System.arraycopy(buffer, index, b, off, count);
		index += count;
		return count;
	}

	@Override
	public int available()
		throws IOException
	{
		return buffer == null ? 0 : buffer.length - index;
	}

	@Override
	public void close()
		throws IOException
	{
		master.close();
	}

	@Override
	public boolean markSupported()
	{
		return false;
	}

	@Override
	public synchronized void reset()
		throws IOException
	{
		throw new IOException("reset() is not supported");
	}
	// -----------------------------------------

	/**
		Get the number of lines parsed so far.
	*/
	public int getNumLines()
	{
		return lineNum;
	}

	/**
		Get the number of unique transcript ids found so far.
	*/
	public int getNumUniqueTranscriptIds()
	{
		return transcriptIds.size();
	}

	/**
		Make sure that the buffer has at least one unread byte, reading
		more data from the GTF file if the current buffer is used up.
		@return TRUE if there is data to read, FALSE at the end of the file
	*/
	private boolean fillBuffer()
		throws IOException
	{
		while (buffer != null && index >= buffer.length)
		{
			// The result must be stored, otherwise the old buffer would be replayed
			buffer = readMore();
			index = 0;
		}
		return buffer != null;
	}

	/**
		Read more data from the GTF file. Lines are read until one is found
		that should be written to the output (repeated transcript ids are
		skipped if that option is enabled). The header line is generated
		together with the first data line. Do not call this method
		unless it is certain that the existing buffer has been completely
		read by the reader of this input stream.
		@return The converted data, or null at the end of the file
	*/
	private byte[] readMore()
		throws IOException
	{
		StringBuilder sb = new StringBuilder();
		while (true)
		{
			String[] line = getNextLine();
			if (line == null) return null;

			if (line.length < 9)
			{
				// Not a data line (eg. a comment); append line as it is
				appendLine(sb, line, null);
				break;
			}

			// Generate headers if this is the first line with data
			boolean generateHeaders = attributes == null;
			parseAttributes(line[8]);
			if (generateHeaders) appendHeaders(sb);

			// Always register the id so that the unique count is correct
			// even when repeated ids are included in the output
			String id = attributes[transcriptIdIndex].value + '@' + line[0];
			boolean firstOccurrence = transcriptIds.add(id);
			if (firstOccurrence || !skipRepeatedTranscriptIds)
			{
				appendLine(sb, line, attributes);
				break;
			}
			// Repeated id that should be skipped; continue with the next line
		}

		// Convert to byte[]
		return sb.toString().getBytes(charset);
	}

	/**
		Append the column header line. The fixed columns come first (only
		seqname and source if skipRepeatedTranscriptIds=true) followed
		by one column for each attribute found on the first data line.
	*/
	private void appendHeaders(StringBuilder sb)
	{
		if (skipRepeatedTranscriptIds)
		{
			sb.append("<seqname>\t<source>");
		}
		else
		{
			sb.append("<seqname>\t<source>\t<feature>\t<start>\t<end>\t<score>\t<strand>\t<frame>");
		}
		for (Attribute attribute : attributes)
		{
			sb.append("\t<").append(attribute.key).append(">");
		}
		sb.append("\n");
	}

	/**
		Read the next line from the GTF file and split on tab character.
		@return The columns (at most 10), or null at the end of the file
	*/
	private String[] getNextLine()
		throws IOException
	{
		String line = reader.readLine();
		if (line == null) return null;
		lineNum++;
		return line.split("\\t", 10);
	}

	/**
		Parse attributes from the given template string. The first time
		this method is called all attributes are accepted and their order
		is remembered. Subsequent calls accept only values for the remembered
		attributes.
		@param template The contents of the attributes column
		@throws IOException If gene_id or transcript_id is missing
	*/
	private void parseAttributes(String template)
		throws IOException
	{
		Matcher m = ATTRIBUTE_PATTERN.matcher(template);

		if (attributes == null)
		{
			// First time
			List<Attribute> tmp = new ArrayList<Attribute>();
			while (m.find())
			{
				tmp.add(new Attribute(m.group(1), m.group(2)));
			}
			// Attribute.equals() only compares keys so indexOf() finds the column
			int geneIdx = tmp.indexOf(Attribute.GENE_ID);
			if (geneIdx == -1)
			{
				throw new IOException("Required attribute 'gene_id' not found, at line: " + lineNum);
			}
			int transcriptIdx = tmp.indexOf(Attribute.TRANSCRIPT_ID);
			if (transcriptIdx == -1)
			{
				throw new IOException("Required attribute 'transcript_id' not found, at line: " + lineNum);
			}
			geneIdIndex = geneIdx;
			transcriptIdIndex = transcriptIdx;
			attributes = tmp.toArray(new Attribute[tmp.size()]);
			return;
		}

		// Subsequent times
		// Clear values from the previous line. This matters when that line
		// was skipped as a repeated transcript id; its values must not leak
		// into this line or hide a missing required attribute
		for (Attribute attribute : attributes)
		{
			attribute.value = null;
		}

		int startAt = 0;
		while (m.find())
		{
			String key = m.group(1);
			String value = m.group(2);
			for (int i = startAt; i < attributes.length; ++i)
			{
				if (attributes[i].key.equals(key))
				{
					attributes[i].value = value;
					// If the order of the attributes is the same on all lines
					// we don't have to look from [0] to [length-1] for all attributes
					if (i == startAt) startAt++;
					break;
				}
			}
		}
		if (attributes[geneIdIndex].value == null)
		{
			throw new IOException("Required attribute 'gene_id' not found, at line: " + lineNum);
		}
		if (attributes[transcriptIdIndex].value == null)
		{
			throw new IOException("Required attribute 'transcript_id' not found, at line: " + lineNum);
		}
	}

	/**
		Append columns to the buffer and separate each with a tab.
		If attributes are given, the first 8 (or 2 if skipRepeatedTranscriptIds=true)
		columns are appended, then each of the attributes are appended.
		If no attributes are given, all columns are copied as they are.

		@param sb The buffer to append to
		@param columns The regular columns (must be at least 8 if attributes are given)
		@param attr The attributes to add, or null to copy the line as it is
	*/
	private void appendLine(StringBuilder sb, String[] columns, Attribute[] attr)
	{
		int end = attr == null ? columns.length : (skipRepeatedTranscriptIds ? 2 : 8);
		sb.append(columns[0]);
		for (int i = 1; i < end; ++i)
		{
			sb.append("\t").append(columns[i]);
		}
		if (attr != null)
		{
			// Then follows the attributes. NOTE! An attribute that is missing
			// on this line has a null value which is written as the text 'null'
			for (Attribute attribute : attr)
			{
				sb.append("\t").append(attribute.value);
			}
			// Anything after the attributes column is forwarded as an extra column
			if (columns.length == 10)
			{
				sb.append("\t").append(columns[9]);
			}
		}
		sb.append("\n");
	}


	/**
		A key/value pair from the attributes column. Two attributes
		are considered equal if they have the same key.
	*/
	static class Attribute
	{
		static final Attribute GENE_ID = new Attribute("gene_id", null);
		static final Attribute TRANSCRIPT_ID = new Attribute("transcript_id", null);

		final String key;
		String value;

		Attribute(String key, String value)
		{
			this.key = key;
			this.value = value;
		}

		@Override
		public int hashCode()
		{
			return key.hashCode();
		}
		@Override
		public boolean equals(Object obj)
		{
			if (this == obj) return true;
			if (obj == null) return false;
			if (obj.getClass() != Attribute.class) return false;
			Attribute o = (Attribute)obj;
			return this.key.equals(o.key);
		}
		@Override
		public String toString()
		{
			return key + "=" + value;
		}

	}

}