svndigest - svndigest

src/core/net/sf/basedb/util/importer/spotdata/FirstPassSectionSpotsParser.java

: Code
: Comments
: Other

Rev	Date	Author	Line
5093	10 Sep 09	nicklas	1	/**
5093	10 Sep 09	nicklas	2	$Id$
5093	10 Sep 09	nicklas	3
5093	10 Sep 09	nicklas	4	Copyright (C) 2009 Nicklas Nordborg
5093	10 Sep 09	nicklas	5
5093	10 Sep 09	nicklas	6	This file is part of BASE - BioArray Software Environment.
5093	10 Sep 09	nicklas	7	Available at http://base.thep.lu.se/
5093	10 Sep 09	nicklas	8
5093	10 Sep 09	nicklas	9	BASE is free software; you can redistribute it and/or
5093	10 Sep 09	nicklas	10	modify it under the terms of the GNU General Public License
5093	10 Sep 09	nicklas	11	as published by the Free Software Foundation; either version 3
5093	10 Sep 09	nicklas	12	of the License, or (at your option) any later version.
5093	10 Sep 09	nicklas	13
5093	10 Sep 09	nicklas	14	BASE is distributed in the hope that it will be useful,
5093	10 Sep 09	nicklas	15	but WITHOUT ANY WARRANTY; without even the implied warranty of
5093	10 Sep 09	nicklas	16	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
5093	10 Sep 09	nicklas	17	GNU General Public License for more details.
5093	10 Sep 09	nicklas	18
5093	10 Sep 09	nicklas	19	You should have received a copy of the GNU General Public License
5093	10 Sep 09	nicklas	20	along with BASE. If not, see <http://www.gnu.org/licenses/>.
5093	10 Sep 09	nicklas	21	*/
5093	10 Sep 09	nicklas	22	package net.sf.basedb.util.importer.spotdata;
5093	10 Sep 09	nicklas	23
5093	10 Sep 09	nicklas	24	import java.io.IOException;
5093	10 Sep 09	nicklas	25	import java.util.ArrayList;
5093	10 Sep 09	nicklas	26	import java.util.Arrays;
5093	10 Sep 09	nicklas	27	import java.util.LinkedList;
5093	10 Sep 09	nicklas	28	import java.util.List;
5093	10 Sep 09	nicklas	29
5093	10 Sep 09	nicklas	30	import net.sf.basedb.core.BaseException;
5093	10 Sep 09	nicklas	31	import net.sf.basedb.core.BioAssaySet;
5093	10 Sep 09	nicklas	32	import net.sf.basedb.core.DbControl;
5405	10 Sep 10	nicklas	33	import net.sf.basedb.core.signal.ThreadSignalHandler;
5093	10 Sep 09	nicklas	34	import net.sf.basedb.util.Values;
5093	10 Sep 09	nicklas	35	import net.sf.basedb.util.basefile.BaseFileParser;
5093	10 Sep 09	nicklas	36	import net.sf.basedb.util.basefile.BaseFileSectionParser;
5093	10 Sep 09	nicklas	37	import net.sf.basedb.util.importer.spotdata.BaseFileInfo.ChildBioAssay;
5093	10 Sep 09	nicklas	38	import net.sf.basedb.util.importer.spotdata.BaseFileInfo.SpotSectionInfo;
5093	10 Sep 09	nicklas	39	import net.sf.basedb.util.parser.FlatFileParser;
5093	10 Sep 09	nicklas	40
5093	10 Sep 09	nicklas	41	/**
5093	10 Sep 09	nicklas	42	Parses a 'spots' section of a BASEfile with bioassay set spot
5093	10 Sep 09	nicklas	43	data. This section is optional but can also appear more than
5093	10 Sep 09	nicklas	44	once (eg. serial BASEfile). The section must have an 'assays'
5093	10 Sep 09	nicklas	45	header that contains the id:s of the bioassays that have spot
5093	10 Sep 09	nicklas	46	data in this section. The id values should either reference an
5093	10 Sep 09	nicklas	47	id from the 'assays' section or the id of an existing bioassay
5093	10 Sep 09	nicklas	48	in the database.
5093	10 Sep 09	nicklas	49	<p>
5093	10 Sep 09	nicklas	50
5096	14 Sep 09	nicklas	51	The section must also contain a 'columns' header that includes at
5096	14 Sep 09	nicklas	52	least 'position', 'reporter' and 'assayData' columns. The 'assayData'
5096	14 Sep 09	nicklas	53	is a meta column that expands to the
5093	10 Sep 09	nicklas	54	columns defined by the required 'assayFields' header. This
5093	10 Sep 09	nicklas	55	header must define as many 'intensityN' columns as there are
5093	10 Sep 09	nicklas	56	channels in the experiment. For two-channel data, it may instead
5093	10 Sep 09	nicklas	57	optionally use M (l2ratio1_2) and A (l10intgmean1_2). Columns with extra
5093	10 Sep 09	nicklas	58	values can be defined by including a 'setExtraFloats' header.
5093	10 Sep 09	nicklas	59	This header just enumerates the extra columns. A macthing column
5093	10 Sep 09	nicklas	60	must also be presen in the 'assayFields' header.
5093	10 Sep 09	nicklas	61	<p>
5096	14 Sep 09	nicklas	62	NOTE! Column names can be redefined by calling
5096	14 Sep 09	nicklas	63	{@link BaseFileParser#setRedefinedColumnName(String, String, String)}.
5096	14 Sep 09	nicklas	64	<p>
5093	10 Sep 09	nicklas	65	Here is an example of the headers and the first data line of a matrix
5093	10 Sep 09	nicklas	66	BASEfile which has two two-channel assays and one extra value column.
5093	10 Sep 09	nicklas	67
5093	10 Sep 09	nicklas	68	<pre class="code">
5093	10 Sep 09	nicklas	69	section spots
5093	10 Sep 09	nicklas	70	assays 101 102
5093	10 Sep 09	nicklas	71	columns position reporter assayData
5093	10 Sep 09	nicklas	72	assayFields intensity1 intensity2 extra1
5093	10 Sep 09	nicklas	73	setExtraFloats extra1
5093	10 Sep 09	nicklas	74	%
5093	10 Sep 09	nicklas	75	1 1 1.0 2.0 0.5 2.0 3.0 0.2
5093	10 Sep 09	nicklas	76	</pre>
5093	10 Sep 09	nicklas	77
5093	10 Sep 09	nicklas	78	The first pass will check the headers and extract position/reporter
5093	10 Sep 09	nicklas	79	mappings. This is important because we need to know if the child
5093	10 Sep 09	nicklas	80	bioassay set has the same or a different position/reporter mapping
5093	10 Sep 09	nicklas	81	than the parent bioassay set.
5093	10 Sep 09	nicklas	82
5093	10 Sep 09	nicklas	83	@author Nicklas
5093	10 Sep 09	nicklas	84	@version 2.14
5093	10 Sep 09	nicklas	85	@base.modified $Date$
5093	10 Sep 09	nicklas	86	*/
5093	10 Sep 09	nicklas	87	public class FirstPassSectionSpotsParser
5093	10 Sep 09	nicklas	88	implements BaseFileSectionParser
5093	10 Sep 09	nicklas	89	{
5093	10 Sep 09	nicklas	90
5096	14 Sep 09	nicklas	91	/**
5096	14 Sep 09	nicklas	92	The section of the BASEfile this parser can parse, eg. 'spots'.
5096	14 Sep 09	nicklas	93	*/
5096	14 Sep 09	nicklas	94	public static final String SECTION = "spots";
5096	14 Sep 09	nicklas	95
5093	10 Sep 09	nicklas	96	private final DbControl dc;
5093	10 Sep 09	nicklas	97	private final BaseFileInfo info;
5093	10 Sep 09	nicklas	98	private final BioAssaySet parent;
5093	10 Sep 09	nicklas	99	private final String totalBytes;
5093	10 Sep 09	nicklas	100	private final long progressReportInterval;
5093	10 Sep 09	nicklas	101	private long nextProgressReport;
5093	10 Sep 09	nicklas	102
5093	10 Sep 09	nicklas	103	private int sectionCount;
5093	10 Sep 09	nicklas	104	private boolean mapZeroToNull;
5093	10 Sep 09	nicklas	105
5093	10 Sep 09	nicklas	106	private List<SpotIntensityParser> spotIntensityParsers;
5093	10 Sep 09	nicklas	107
5093	10 Sep 09	nicklas	108	/**
5093	10 Sep 09	nicklas	109	Creates a new 'section spot' parser for the first pass. This
5093	10 Sep 09	nicklas	110	parser will extract header information and position/reporter
5093	10 Sep 09	nicklas	111	mapping information.
5093	10 Sep 09	nicklas	112
5093	10 Sep 09	nicklas	113	@param dc A DbControl to use for database access
5093	10 Sep 09	nicklas	114	@param info Information about the file we are parsing
5093	10 Sep 09	nicklas	115	@param parent The parent bioassay set
5093	10 Sep 09	nicklas	116	*/
5093	10 Sep 09	nicklas	117	public FirstPassSectionSpotsParser(DbControl dc, BaseFileInfo info, BioAssaySet parent)
5093	10 Sep 09	nicklas	118	{
5093	10 Sep 09	nicklas	119	this.dc = dc;
5093	10 Sep 09	nicklas	120	this.info = info;
5093	10 Sep 09	nicklas	121	this.parent = parent;
5093	10 Sep 09	nicklas	122	this.progressReportInterval = info.getSize() / 100;
5093	10 Sep 09	nicklas	123	this.nextProgressReport = progressReportInterval;
5093	10 Sep 09	nicklas	124	this.totalBytes = Values.formatBytes(info.getSize());
5093	10 Sep 09	nicklas	125	int channels = parent.getRawDataType().getChannels();
5093	10 Sep 09	nicklas	126	if (channels == 2) addSpotIntensityParser(new MAParser());
5093	10 Sep 09	nicklas	127	addSpotIntensityParser(new GenericIntensityParser(channels));
5093	10 Sep 09	nicklas	128	}
5093	10 Sep 09	nicklas	129
5093	10 Sep 09	nicklas	130	/*
5093	10 Sep 09	nicklas	131	From the BaseFileSectionParser interface
5093	10 Sep 09	nicklas	132	----------------------------------------
5093	10 Sep 09	nicklas	133	*/
5093	10 Sep 09	nicklas	134	@Override
5093	10 Sep 09	nicklas	135	public void parseSection(BaseFileParser parser, FlatFileParser ffp)
5093	10 Sep 09	nicklas	136	throws IOException
5093	10 Sep 09	nicklas	137	{
5093	10 Sep 09	nicklas	138	++sectionCount;
5093	10 Sep 09	nicklas	139
5093	10 Sep 09	nicklas	140	// If this is the first 'spots' section we must load some
5093	10 Sep 09	nicklas	141	// info from the parent bioassay set so we can check if
5093	10 Sep 09	nicklas	142	// the position/reporter mapping has changed. If it has
5093	10 Sep 09	nicklas	143	// we are going to need a new datacube.
5093	10 Sep 09	nicklas	144	if (sectionCount == 1)
5093	10 Sep 09	nicklas	145	{
5093	10 Sep 09	nicklas	146	info.loadParentReporterPositions(dc, parent);
5093	10 Sep 09	nicklas	147	mapZeroToNull = info.getParentHasNullReporter() && !info.getParentHasZeroReporter();
5093	10 Sep 09	nicklas	148	}
5093	10 Sep 09	nicklas	149
5093	10 Sep 09	nicklas	150	String filename = info.getName();
5093	10 Sep 09	nicklas	151	ffp.parseHeaders();
5093	10 Sep 09	nicklas	152	int line = ffp.getParsedLines();
5093	10 Sep 09	nicklas	153	SpotSectionInfo ssInfo = new SpotSectionInfo();
5093	10 Sep 09	nicklas	154
5093	10 Sep 09	nicklas	155	// Get the "position", "reporter" and "assayData" columns from the "columns" header
5096	14 Sep 09	nicklas	156	List<String> columns = parser.getRequiredHeader(ffp, "columns", "\\t", SECTION, filename);
5096	14 Sep 09	nicklas	157	int positionIndex = parser.getRequiredIndex(columns,
5096	14 Sep 09	nicklas	158	parser.getRedefinedColumnName(SECTION, "position"), "columns", SECTION, line, filename);
5096	14 Sep 09	nicklas	159	int reporterIndex = parser.getRequiredIndex(columns,
5096	14 Sep 09	nicklas	160	parser.getRedefinedColumnName(SECTION, "reporter"), "columns", SECTION, line, filename);
5096	14 Sep 09	nicklas	161	int assayDataIndex = parser.getRequiredIndex(columns,
5096	14 Sep 09	nicklas	162	parser.getRedefinedColumnName(SECTION, "assayData"), "columns", SECTION, line, filename);
5093	10 Sep 09	nicklas	163
5093	10 Sep 09	nicklas	164	ssInfo.setPositionIndex(positionIndex);
5093	10 Sep 09	nicklas	165
5093	10 Sep 09	nicklas	166	// Check the "assays" header
5096	14 Sep 09	nicklas	167	List<String> assays = parser.getRequiredHeader(ffp, "assays", "\\t", SECTION, filename);
5093	10 Sep 09	nicklas	168	List<Integer> assayIds = new ArrayList<Integer>(assays.size());
5093	10 Sep 09	nicklas	169	for (String assay : assays)
5093	10 Sep 09	nicklas	170	{
5093	10 Sep 09	nicklas	171	Integer assayId = Values.getInt(assay);
5093	10 Sep 09	nicklas	172	assayIds.add(assayId);
5093	10 Sep 09	nicklas	173	if (info.addChildAssay(new ChildBioAssay(assayId)))
5093	10 Sep 09	nicklas	174	{
5093	10 Sep 09	nicklas	175	info.checkBioAssay(dc, assayId, line);
5093	10 Sep 09	nicklas	176	}
5093	10 Sep 09	nicklas	177	}
5093	10 Sep 09	nicklas	178	ssInfo.setAssays(assayIds);
5093	10 Sep 09	nicklas	179
5093	10 Sep 09	nicklas	180	// Check the "assayFields" header
5096	14 Sep 09	nicklas	181	List<String> assayFields = parser.getRequiredHeader(ffp, "assayFields", "\\t", SECTION, filename);
5093	10 Sep 09	nicklas	182	if (spotIntensityParsers != null)
5093	10 Sep 09	nicklas	183	{
5093	10 Sep 09	nicklas	184	for (SpotIntensityParser spiParser : spotIntensityParsers)
5093	10 Sep 09	nicklas	185	{
5096	14 Sep 09	nicklas	186	if (spiParser.hasRequiredAssayFields(parser, ffp, assayFields))
5093	10 Sep 09	nicklas	187	{
5093	10 Sep 09	nicklas	188	ssInfo.setSpotIntensityParser(spiParser);
5093	10 Sep 09	nicklas	189	break;
5093	10 Sep 09	nicklas	190	}
5093	10 Sep 09	nicklas	191	}
5093	10 Sep 09	nicklas	192	}
5093	10 Sep 09	nicklas	193	if (ssInfo.getSpotIntensityParser() == null)
5093	10 Sep 09	nicklas	194	{
5093	10 Sep 09	nicklas	195	throw new BaseException("Can't find any intensity columns in 'assayFields' header " +
5093	10 Sep 09	nicklas	196	"in section 'spots' at line " + line + " in file '" + filename + "'");
5093	10 Sep 09	nicklas	197	}
5093	10 Sep 09	nicklas	198	ssInfo.setAssayFields(assayFields);
5093	10 Sep 09	nicklas	199	ssInfo.setFirstAssayFieldIndex(assayDataIndex);
5093	10 Sep 09	nicklas	200
5093	10 Sep 09	nicklas	201	// Check the "setExtraFloats" header
5093	10 Sep 09	nicklas	202	String extraFloats = ffp.getHeader("setExtraFloats");
5093	10 Sep 09	nicklas	203	if (extraFloats != null)
5093	10 Sep 09	nicklas	204	{
5093	10 Sep 09	nicklas	205	List<String> extraFloatColumns = Arrays.asList(extraFloats.split("\\t"));
5093	10 Sep 09	nicklas	206	for (String ef : extraFloatColumns)
5093	10 Sep 09	nicklas	207	{
5093	10 Sep 09	nicklas	208	int index = assayFields.indexOf(ef);
5093	10 Sep 09	nicklas	209	if (index < 0)
5093	10 Sep 09	nicklas	210	{
5093	10 Sep 09	nicklas	211	throw new BaseException(
5093	10 Sep 09	nicklas	212	"Can't find extra float column '" + ef + "' in 'assayFields' " +
5093	10 Sep 09	nicklas	213	"header in section 'spots' at line " + line + " in file '" + filename + "'");
5093	10 Sep 09	nicklas	214	}
5093	10 Sep 09	nicklas	215	info.addExtraFloat(dc, ef);
5093	10 Sep 09	nicklas	216	ssInfo.addExtraFloatParser(new ExtraFloatParser(ef, index));
5093	10 Sep 09	nicklas	217	}
5093	10 Sep 09	nicklas	218	}
5093	10 Sep 09	nicklas	219	info.addSpotSectionInfo(sectionCount, ssInfo);
5093	10 Sep 09	nicklas	220
5093	10 Sep 09	nicklas	221	// Parse data and check if each position has same reporter as before
5093	10 Sep 09	nicklas	222	FlatFileParser.Data data;
5093	10 Sep 09	nicklas	223	while ((data = ffp.nextData()) != null)
5093	10 Sep 09	nicklas	224	{
5405	10 Sep 10	nicklas	225	ThreadSignalHandler.checkInterrupted();
5093	10 Sep 09	nicklas	226
5093	10 Sep 09	nicklas	227	long parsedBytes = ffp.getParsedBytes();
5093	10 Sep 09	nicklas	228	line = ffp.getParsedLines();
5093	10 Sep 09	nicklas	229	if (parsedBytes >= nextProgressReport)
5093	10 Sep 09	nicklas	230	{
5093	10 Sep 09	nicklas	231	nextProgressReport = parsedBytes + progressReportInterval;
5093	10 Sep 09	nicklas	232	parser.setProgress(parsedBytes, "Importing spot data (first pass): " +
5093	10 Sep 09	nicklas	233	Values.formatBytes(parsedBytes) + " of " + totalBytes);
5093	10 Sep 09	nicklas	234	}
5093	10 Sep 09	nicklas	235
5093	10 Sep 09	nicklas	236	// Get the position and reporter
7665	20 Mar 19	nicklas	237	Integer position = data.getInt(positionIndex);
7665	20 Mar 19	nicklas	238	Integer newReporterId = data.getInt(reporterIndex);
5093	10 Sep 09	nicklas	239	// Error if position is missing
5093	10 Sep 09	nicklas	240	if (position == null)
5093	10 Sep 09	nicklas	241	{
5093	10 Sep 09	nicklas	242	throw new BaseException("Missing or invalid value for 'position' " +
5093	10 Sep 09	nicklas	243	"in section 'spots' at line " + line + " in file '" + filename + "'");
5093	10 Sep 09	nicklas	244	}
5093	10 Sep 09	nicklas	245	// Convert 0 -> null since some BASE1 plug-ins convert null -> 0
5093	10 Sep 09	nicklas	246	if (mapZeroToNull && newReporterId != null && newReporterId.intValue() == 0)
5093	10 Sep 09	nicklas	247	{
5093	10 Sep 09	nicklas	248	newReporterId = null;
5093	10 Sep 09	nicklas	249	}
5093	10 Sep 09	nicklas	250
5093	10 Sep 09	nicklas	251	// Store this position->reporter mapping. If the same position has
5093	10 Sep 09	nicklas	252	// been mapped to a different reporter, throw an exception
5093	10 Sep 09	nicklas	253	Integer registeredReporterId = info.addChildReporter(position, newReporterId);
5093	10 Sep 09	nicklas	254	if (newReporterId != registeredReporterId)
5093	10 Sep 09	nicklas	255	{
5093	10 Sep 09	nicklas	256	throw new BaseException("Invalid value for 'reporter' (" + newReporterId +
5093	10 Sep 09	nicklas	257	") for position '" + position + "' in section 'spots' at line " +
5093	10 Sep 09	nicklas	258	line + " in file '" + filename +
5093	10 Sep 09	nicklas	259	"'. Expected '" + registeredReporterId + "'");
5093	10 Sep 09	nicklas	260	}
5093	10 Sep 09	nicklas	261	}
5093	10 Sep 09	nicklas	262	}
5093	10 Sep 09	nicklas	263	// ----------------------------------------------------
5093	10 Sep 09	nicklas	264
5093	10 Sep 09	nicklas	265	/**
5093	10 Sep 09	nicklas	266	Adds a spot intensity parser to the list of used parsers.
5093	10 Sep 09	nicklas	267	The first found parser that signals that all required assay fields
5093	10 Sep 09	nicklas	268	are present will be used to parse the spot intensity data.
5093	10 Sep 09	nicklas	269	@see GenericIntensityParser
5093	10 Sep 09	nicklas	270	@see MAParser
5093	10 Sep 09	nicklas	271	*/
5093	10 Sep 09	nicklas	272	public void addSpotIntensityParser(SpotIntensityParser parser)
5093	10 Sep 09	nicklas	273	{
5093	10 Sep 09	nicklas	274	if (spotIntensityParsers == null)
5093	10 Sep 09	nicklas	275	{
5093	10 Sep 09	nicklas	276	spotIntensityParsers = new LinkedList<SpotIntensityParser>();
5093	10 Sep 09	nicklas	277	}
5093	10 Sep 09	nicklas	278	spotIntensityParsers.add(parser);
5093	10 Sep 09	nicklas	279	}
5093	10 Sep 09	nicklas	280
5093	10 Sep 09	nicklas	281	}