diff --git a/README.md b/README.md index ac11046..e4bd73d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ -[![Build Status](https://travis-ci.com/JULIELab/costosys.svg?branch=master)](https://travis-ci.com/JULIELab/costosys) +[![Build Status](https://travis-ci.com/JULIELab/costosys.svg?branch=master)](https://travis-ci.com/JULIELab/costosys) +[![Codacy Badge](https://api.codacy.com/project/badge/Grade/6c06345e4f6b4a18a0e38043f11c6e60)](https://app.codacy.com/app/khituras/costosys?utm_source=github.com&utm_medium=referral&utm_content=JULIELab/costosys&utm_campaign=Badge_Grade_Dashboard)[![Automated Release Notes by gren](https://img.shields.io/badge/%F0%9F%A4%96-release%20notes-00B2EE.svg)](https://github-tools.github.io/github-release-notes/) # CoStoSys The Corpus Storage System (CoStoSys) is a tool and abstraction layer for a PostgreSQL document database. @@ -46,11 +47,6 @@ have been predefined, including | medline_2017 | Defines the columns 'pmid' and 'xml'. Import data is expected to be in PubMed XML PubmedArticleSet format where one large XML file contains a bulk of PubMed articles. The individual articles must be located at XPath /PubmedArticleSet/PubmedArticle/MedlineCitation. This format is employed by the downloadable PubMed distribution since 2017. XML data are stored in GZIP format.| | medline_2016 | Defines the columns 'pmid' and 'xml'. Import data is expected to be in MEDLINE XML MedlineCitationSet format where one large XML file contains a bulk of MEDLINE articles. The individual articles must be located at XPath /MedlineCitationSet/MedlineCitation. This format was employed by the downloadable MEDLINE distribution until 2016. XML data are stored in GZIP format. | | pubmed_gzip | The same as medline_2017. | -| xmi_text | Used internally. Defines the columns 'pmid', 'xmi', 'max_xmi_id' and 'sofa_mapping'. 
Used by the JeDIS components [jcore-xmi-db-reader](https://github.com/JULIELab/jcore-base/tree/b2128199bd548dd989b0d7c198634ed79670e8c7/jcore-xmi-db-reader) and [jcore-xmi-db-writer](https://github.com/JULIELab/jcore-base/tree/b2128199bd548dd989b0d7c198634ed79670e8c7/jcore-xmi-db-writer) to read and store UIMA annotation graphs in XMI format that were segmented into annotation types with separate storage.| -| xmi_annotation | Used internally. Defines the columns 'pmid' and 'xmi'. This table schema is used for the annotation data segmented away from full XMI annotation graphs, see xmi_text. | -| xmi_text_gzip | Used internally. The same as xmi_text but the contents of the xmi column are stored an GZIP format.| -| max_id_addition | Used internally. Defines the fields 'pmid', 'xmi' and 'max_xmi_id' but only marks the 'max_xmi_id' column for retrieval. This schema is not supposed to be used for data import but for a table with xmi_text schema for which only the current maximum XMI ID should be retrieved. Technical detail of the JeDIS architecture.| -| xmi_annotation_gzip | Used internally. The same as xmi_annotation but with the XMI data in GZIP format.| Custom table schema may be added to the configuration at XPath `/databaseConnectorConfiguration/DBSchemaInformation/tableSchemas`. Refer to docbook documentation and the XML schema for details. diff --git a/pom.xml b/pom.xml index 8d93d8c..de8d6c8 100644 --- a/pom.xml +++ b/pom.xml @@ -2,7 +2,7 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 costosys - 1.3.2 + 1.4.0-SNAPSHOT Corpus Storage System A utility for managing documents stored in a PostgreSQL database. The documents are imported into a PostgreSQL DB as full texts with the goal to be able to retrieve the documents by their PubMedID efficiently. 
@@ -21,7 +21,7 @@ - de.julielab.xmlData.cli.CLI + de.julielab.costosys.cli.CLI diff --git a/src/main/java/de/julielab/costosys/Constants.java b/src/main/java/de/julielab/costosys/Constants.java new file mode 100644 index 0000000..27827bd --- /dev/null +++ b/src/main/java/de/julielab/costosys/Constants.java @@ -0,0 +1,116 @@ +/** + * Constants.java + * + * Copyright (c) 2010, JULIE Lab. + * All rights reserved. This program and the accompanying materials + * are made available under the terms of the Common Public License v1.0 + * + * Author: faessler + * + * Current version: 1.0 + * Since version: 1.0 + * + * Creation date: 19.11.2010 + **/ + +package de.julielab.costosys; + +/** + * This class provides Constants useful for common tasks. Examples include + * database field names for the import or retrieval of Medline documents, table + * names etc. + * + * @author faessler + */ +public final class Constants { + + // Field attribute names + + /** + * The default PostgreSQL schema in which all data related tables are + * stored. The schema is {@value #DEFAULT_DATA_SCHEMA}. + */ + public static final String DEFAULT_DATA_SCHEMA = "_data"; + + /** + * Constant for the name of a database table holding at least document ID + * and document data (e.g. PubmedId and Medline XML). Value: + * {@value #DEFAULT_DATA_TABLE_NAME}. + */ + public static final String DEFAULT_DATA_TABLE_NAME = DEFAULT_DATA_SCHEMA + + "._data"; + + public static final String VALUE = "value"; + + // SQL type constants + + public static final String TYPE_TEXT = "text"; + + public static final String TYPE_TEXT_ARRAY = "text[]"; + + public static final String TYPE_VARCHAR_ARRAY = "varchar[]"; + + public static final String TYPE_BINARY_DATA = "bytea"; + + /** + * Constant for a possible value of type. + *

+ * Used to to create a timestamp without time zone. + */ + public static final String TYPE_TIMESTAMP_WITHOUT_TIMEZONE = "timestamp without time zone"; + + public static final String TYPE_INTEGER = "integer"; + + public static final String TYPE_BOOLEAN = "boolean"; + + public static final String TYPE_XML = "xml"; + + public static final String XML_FIELD_NAME = "xml"; + + public static final String PMID_FIELD_NAME = "pmid"; + + public static final String DATE_FIELD_NAME = "date"; + + public static final String NLM_ID_FIELD_NAME = "nlm_id"; + + public static final String AUTO_ID_FIELD_NAME = "autoID"; + + public static final String HAS_ERRORS = "has_errors"; + + public static final String LOG = "log"; + + public static final String IN_PROCESS = "is_in_process"; + + public static final String IS_PROCESSED = "is_processed"; + + public static final String LAST_COMPONENT = "last_component"; + + public static final String HOST_NAME = "host_name"; + + public static final String PROCESSING_TIMESTAMP = "processing_timestamp"; + + public static final String PID = "pid"; + + @Deprecated + public static final String DOC_ID_FIELD_NAME = "doc_id"; + + public static final String PROCESSED = "is_processed"; + + public static final String HIDDEN_CONFIG_PATH = "dbcTest.hiddenConfigPath"; + + public static final String COSTOSYS_CONFIG_FILE = "costosys.configurationfile"; + + public static final String MIRROR_COLLECTION_NAME = "public._mirrorSubsets"; + + public static final String MIRROR_COLUMN_DATA_TABLE_NAME = "datatablename"; + + public static final String MIRROR_COLUMN_SUBSET_NAME = "subsettablename"; + + public static final String MIRROR_COLUMN_DO_RESET = "performreset"; + + public static final String TIMESTAMP_FIELD_NAME = "timestamp"; + + public static final String TOTAL = "total"; + + +} diff --git a/src/main/java/de/julielab/costosys/cli/CLI.java b/src/main/java/de/julielab/costosys/cli/CLI.java new file mode 100644 index 0000000..122cb52 --- /dev/null +++ 
b/src/main/java/de/julielab/costosys/cli/CLI.java @@ -0,0 +1,1063 @@ +/** + * QueryCLI.java + *

+ * Copyright (c) 2010, JULIE Lab. + * All rights reserved. This program and the accompanying materials + * are made available under the terms of the Common Public License v1.0 + *

+ * Author: faessler + *

+ * Current version: 1.0 + * Since version: 1.0 + *

+ * Creation date: 20.11.2010 + **/ + +package de.julielab.costosys.cli; + +import de.julielab.costosys.Constants; +import de.julielab.costosys.dbconnection.SubsetStatus; +import de.julielab.costosys.medline.ConfigurationConstants; +import de.julielab.costosys.medline.Updater; +import de.julielab.xml.JulieXMLConstants; +import de.julielab.xml.JulieXMLTools; +import de.julielab.costosys.configuration.TableSchemaDoesNotExistException; +import de.julielab.costosys.dbconnection.CoStoSysConnection; +import de.julielab.costosys.dbconnection.DataBaseConnector; +import org.apache.commons.cli.*; +import org.apache.commons.configuration2.XMLConfiguration; +import org.apache.commons.configuration2.builder.FileBasedConfigurationBuilder; +import org.apache.commons.configuration2.builder.fluent.Parameters; +import org.apache.commons.configuration2.ex.ConfigurationException; +import org.apache.commons.lang3.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.*; +import java.sql.SQLException; +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +import static de.julielab.costosys.dbconnection.DataBaseConnector.StatusElement.*; + +/** + * Command line interface for interaction with a databases holding e.g. Medline + * XML data. 
+ * + * @author faessler / hellrich + */ +public class CLI { + + private final static String DELIMITER = "\n--------------------------------------------------------------------------------\n"; + + private static final Logger LOG = LoggerFactory.getLogger(CLI.class); + private static final String KEY_PART_SEPERATOR = "\t"; + private static final String FILE_SEPERATOR = System.getProperty("file.separator"); + public static String[] USER_SCHEME_DEFINITION = new String[]{"dbcconfiguration.xml", "costosys.xml", "costosysconfiguration.xml"}; + private static boolean verbose = false; + + private static void logMessage(String msg) { + if (!verbose) + return; + LOG.info(msg); + } + + public static void main(String[] args) throws Exception { + long time = System.currentTimeMillis(); + String dbUrl; + String user; + String password; + String dbName; + String serverName; + String pgSchema; + String msg; + boolean updateMode = false; + + boolean error = false; + Mode mode = Mode.ERROR; + + Options options = getOptions(); + + // What has to be done + CommandLineParser parser = new DefaultParser(); + CommandLine cmd = null; + try { + cmd = parser.parse(options, args); + } catch (ParseException e) { + LOG.error("Can't parse arguments: " + e.getMessage()); + printHelp(options); + System.exit(1); + } + + verbose = cmd.hasOption('v'); + if (verbose) + LOG.info("Verbose logging enabled."); + + // selecting the mode + if (cmd.hasOption("h")) + error = true; // To show help + if (cmd.hasOption("i")) + mode = Mode.IMPORT; + if (cmd.hasOption("u")) { + mode = Mode.IMPORT; + updateMode = true; + } + if (cmd.hasOption("q")) + mode = Mode.QUERY; + if (cmd.getOptionValue("s") != null) + mode = Mode.SUBSET; + if (cmd.getOptionValue("re") != null) + mode = Mode.RESET; + if (cmd.getOptionValue("st") != null) + mode = Mode.STATUS; + if (cmd.hasOption("t")) + mode = Mode.TABLES; + if (cmd.hasOption("lts")) + mode = Mode.LIST_TABLE_SCHEMAS; + if (cmd.hasOption("td")) + mode = Mode.TABLE_DEFINITION; 
+ if (cmd.hasOption("sch")) + mode = Mode.SCHEME; + if (cmd.hasOption("ch")) + mode = Mode.CHECK; + if (cmd.hasOption("dc")) + mode = Mode.DEFAULT_CONFIG; + if (cmd.hasOption("dt")) + mode = Mode.DROP_TABLE; + if (cmd.hasOption("um")) + mode = Mode.UPDATE_MEDLINE; + + // authentication + // configuration file + String dbcConfigPath = null; + if (cmd.hasOption("dbc")) + dbcConfigPath = cmd.getOptionValue("dbc"); + if (dbcConfigPath == null) + dbcConfigPath = findConfigurationFile(); + File conf = new File(dbcConfigPath); + dbUrl = cmd.getOptionValue('U'); + if (dbUrl == null) { + msg = "No database URL given. Using value in configuration file"; + logMessage(msg); + } + user = cmd.getOptionValue("n"); + if (user == null) { + msg = "No database username given. Using value in configuration file"; + logMessage(msg); + } + password = cmd.getOptionValue("p"); + if (password == null) { + msg = "No password given. Using value in configuration file"; + logMessage(msg); + } + serverName = cmd.getOptionValue("srv"); + dbName = cmd.getOptionValue("db"); + pgSchema = cmd.getOptionValue("pgs"); + if (!((serverName != null && dbName != null) ^ dbUrl != null) + && !(serverName == null && dbName == null && dbUrl == null) && !conf.exists()) { + LOG.error( + "No base configuration has been found. Thus, you must specify server name and database name or the complete URL with -u (but not both)."); + System.exit(1); + } + + DataBaseConnector dbc = null; + try { + if (conf.exists()) { + logMessage(String.format("Using configuration file at %s", conf)); + if (dbUrl == null) + dbc = new DataBaseConnector(serverName, dbName, user, password, pgSchema, + new FileInputStream(conf)); + else + dbc = new DataBaseConnector(dbUrl, user, password, pgSchema, new FileInputStream(conf)); + } else { + logMessage(String.format( + "No custom configuration found (should be located at %s). 
Using default configuration.", + Stream.of(USER_SCHEME_DEFINITION).collect(Collectors.joining(" or ")))); + if (dbUrl == null) + dbc = new DataBaseConnector(serverName, dbName, user, password, pgSchema, null); + else + dbc = new DataBaseConnector(dbUrl, user, password, pgSchema, null); + } + } catch (FileNotFoundException e) { + e.printStackTrace(); + } + + // all those options... + String tableName = cmd.getOptionValue("td"); + if (tableName == null) + tableName = cmd.getOptionValue("ch"); + + String subsetTableName = cmd.getOptionValue("s"); + if (subsetTableName == null) + subsetTableName = cmd.getOptionValue("re"); + if (subsetTableName == null) + subsetTableName = cmd.getOptionValue("renp"); + if (subsetTableName == null) + subsetTableName = cmd.getOptionValue("st"); + + String fileStr = cmd.getOptionValue("f"); + if (fileStr == null) + fileStr = cmd.getOptionValue("i"); + if (fileStr == null) + fileStr = cmd.getOptionValue("u"); + if (cmd.hasOption("im")) { + mode = Mode.IMPORT; + // For some reasons, multuple versions of some documents have been found in the baseline in the past. + // Just use the update mode. + XMLConfiguration importConfig = loadXmlConfiguration(new File(cmd.getOptionValue("im"))); + fileStr = importConfig.getString(ConfigurationConstants.INSERTION_INPUT); + updateMode = true; + } + + String superTableName = cmd.getOptionValue("z"); + if (superTableName == null) + superTableName = dbc.getActiveDataTable(); + + String queryStr = cmd.getOptionValue("q"); + String subsetJournalFileName = cmd.getOptionValue("j"); + String subsetQuery = cmd.getOptionValue("o"); + String randomSubsetSize = cmd.getOptionValue("r"); + String whereClause = cmd.getOptionValue("w"); + String xpath = cmd.getOptionValue("x"); + String baseOutDir = cmd.getOptionValue("out"); + String batchSize = cmd.getOptionValue("bs"); + String limit = cmd.getOptionValue("l"); + String tableSchema = cmd.getOptionValue("ts") != null ? 
cmd.getOptionValue("ts") : dbc.getActiveTableSchema(); + boolean useDelimiter = baseOutDir != null ? false : cmd.hasOption("d"); + boolean returnPubmedArticleSet = cmd.hasOption("pas"); + boolean mirrorSubset = cmd.hasOption("m"); + boolean all4Subset = cmd.hasOption("a"); + Integer numberRefHops = cmd.hasOption("rh") ? Integer.parseInt(cmd.getOptionValue("rh")) : null; + + if (tableSchema.matches("[0-9]+")) { + tableSchema = dbc.getConfig().getTableSchemaNames().get(Integer.parseInt(tableSchema)); + } + + try (CoStoSysConnection conn = dbc.obtainOrReserveConnection()) { + switch (mode) { + case QUERY: + QueryOptions qo = new QueryOptions(); + qo.fileStr = fileStr; + qo.queryStr = queryStr; + qo.useDelimiter = useDelimiter; + qo.pubmedArticleSet = returnPubmedArticleSet; + qo.xpath = xpath; + qo.baseOutDirStr = baseOutDir; + qo.batchSizeStr = batchSize; + qo.limitStr = limit; + qo.tableName = superTableName; + qo.tableSchema = tableSchema; + qo.whereClause = whereClause; + qo.numberRefHops = numberRefHops; + error = doQuery(dbc, qo); + break; + + case IMPORT: + error = doImportOrUpdate(dbc, fileStr, queryStr, superTableName, updateMode); + break; + + case SUBSET: + error = doSubset(dbc, subsetTableName, fileStr, queryStr, superTableName, subsetJournalFileName, + subsetQuery, mirrorSubset, whereClause, all4Subset, randomSubsetSize, numberRefHops); + break; + + case RESET: + if (subsetTableName == null) { + LOG.error("You must provide the name of the subset table to reset."); + error = true; + } else { + boolean files = cmd.hasOption("f"); + try { + if (!files || StringUtils.isBlank(fileStr)) { + boolean np = cmd.hasOption("np"); + boolean ne = cmd.hasOption("ne"); + String lc = cmd.hasOption("lc") ? 
cmd.getOptionValue("lc") : null; + if (np) + logMessage("table reset is restricted to non-processed table rows"); + if (ne) + logMessage("table reset is restricted to table row without errors"); + if (lc != null) + logMessage("table reset is restricted to rows with last component " + lc); + if (!np && !ne && lc == null) { + SubsetStatus status = dbc.status(subsetTableName, EnumSet.of(IN_PROCESS, IS_PROCESSED, TOTAL)); + long inProcess = status.inProcess; + long isProcessed = status.isProcessed; + long total = status.total; + // We don't bother with too small datasets, worst + // case would be to do it again for 10000 docs which + // is not much. + if (total > 10000 && inProcess + isProcessed >= total / 2) { + String input = getYesNoAnswer("The subset table \"" + subsetTableName + + "\" is in process or already processed over 50%." + + " Do you really wish to reset it completely into an unprocessed state? (yes/no)"); + if (input.equals("no")) + System.exit(0); + } + } + dbc.resetSubset(subsetTableName, np, ne, lc); + } else { + logMessage("Resetting all documents identified by the IDs in file \"" + fileStr + "\"."); + try { + List pkValues = asListOfArrays(fileStr); + dbc.resetSubset(subsetTableName, pkValues); + } catch (IOException e) { + e.printStackTrace(); + } + } + } catch (TableNotFoundException e) { + e.printStackTrace(); + } + } + break; + case STATUS: + error = doStatus(dbc, + subsetTableName, + cmd.hasOption("he"), + cmd.hasOption("isp"), + cmd.hasOption("inp"), + cmd.hasOption("to"), + cmd.hasOption("lc")); + break; + + case TABLES: + for (String s : dbc.getTables()) + System.out.println(s); + break; + + case TABLE_DEFINITION: + for (String s : dbc.getTableDefinition(tableName)) + System.out.println(s); + break; + + case LIST_TABLE_SCHEMAS: + System.out.println("The following table schema names are contained in the current configuration:\n"); + List tableSchemaNames = dbc.getConfig().getTableSchemaNames(); + IntStream.range(0, 
tableSchemaNames.size()).mapToObj(i -> i + " " + tableSchemaNames.get(i)) + .forEach(System.out::println); + break; + + case SCHEME: + System.out.println(dbc.getScheme()); + break; + + case CHECK: + dbc.checkTableDefinition(tableName); + break; + + case DEFAULT_CONFIG: + System.out.println(new String(dbc.getEffectiveConfiguration())); + break; + + case DROP_TABLE: + dropTableInteractively(dbc, cmd.getOptionValue("dt")); + break; + + case UPDATE_MEDLINE: + Updater updater = new Updater(loadXmlConfiguration(new File(cmd.getOptionValue("um")))); + updater.process(dbc); + break; + + case ERROR: + break; + } + } + + if (error) { + // printHelp(options); + System.exit(1); + } + + time = System.currentTimeMillis() - time; + LOG.info(String.format("Processing took %d seconds.", time / 1000)); + } + + public static String findConfigurationFile() throws ConfigurationNotFoundException { + String configFileProperty = System.getProperty(Constants.COSTOSYS_CONFIG_FILE); + if (configFileProperty != null && new File(configFileProperty).exists()) + return configFileProperty; + File workingDirectory = new File("."); + Set possibleConfigFileNames = new HashSet<>(Arrays.asList(USER_SCHEME_DEFINITION)); + for (String file : workingDirectory.list()) { + if (possibleConfigFileNames.contains(file.toLowerCase())) + return file; + } + throw new ConfigurationNotFoundException("No configuration file with a name in " + Arrays.toString(USER_SCHEME_DEFINITION) + " was found in the current working directory " + new File(".").getAbsolutePath()); + } + + private static void dropTableInteractively(DataBaseConnector dbc, String tableName) { + try { + if (!dbc.tableExists(tableName)) { + if (tableName.contains(".")) + System.err + .println("Table \"" + tableName + "\" does not exist in database " + dbc.getDbURL() + "."); + else + System.err.println("Table \"" + tableName + "\" does not exist in database " + dbc.getDbURL() + + " in active schema " + dbc.getActivePGSchema() + "."); + return; + } else { 
+ String unqualifiedTableName = tableName.contains(".") ? tableName.substring(tableName.indexOf(".") + 1) + : tableName; + String schema = tableName.contains(".") ? tableName.substring(0, tableName.indexOf(".")) + : dbc.getActivePGSchema(); + System.out.println("Found table \"" + unqualifiedTableName + "\" in schema " + schema + " in database " + + dbc.getDbURL() + ". Do you really want to drop it (y/n)?"); + BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + String response = in.readLine().toLowerCase(); + while (!response.equals("y") && !response.equals("yes") && !response.equals("n") + && !response.equals("no")) { + System.out.println("Please specify y(es) or n(o)."); + response = in.readLine().toLowerCase(); + } + if (response.startsWith("y")) { + System.out.println("Dropping table \"" + unqualifiedTableName + "\" in Postgres schema \"" + schema + + "\" of database " + dbc.getDbURL()); + dbc.dropTable(String.join(".", schema, unqualifiedTableName)); + } else { + System.out.println("User canceled. Aborting process."); + } + } + } catch (IOException | SQLException e) { + e.printStackTrace(); + } + } + + /** + * Poses question to the user and awaits for a yes or + * no answer and returns it. 
+ * + * @param question the question raised + * @return the answer yes or no + */ + private static String getYesNoAnswer(String question) { + BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); + String input = ""; + try { + while (!input.equals("yes") && !input.equals("no")) { + System.out.println(question); + input = br.readLine(); + } + } catch (IOException e) { + LOG.error("Something went wrong while reading from STDIN: ", e); + System.exit(1); + } + return input; + } + + private static boolean doStatus(DataBaseConnector dbc, String subsetTableName, boolean showHasErrors, boolean showIsProcessed, boolean showIsInProcess, boolean showTotal, boolean showLastComponent) { + boolean error = false; + try { + if (subsetTableName == null) { + LOG.error("You must provide the name of a subset table to display its status."); + error = true; + } else { + EnumSet modes = EnumSet.noneOf(DataBaseConnector.StatusElement.class); + if (showHasErrors) modes.add(DataBaseConnector.StatusElement.HAS_ERRORS); + if (showIsProcessed) modes.add(DataBaseConnector.StatusElement.IS_PROCESSED); + if (showIsInProcess) modes.add(DataBaseConnector.StatusElement.IN_PROCESS); + if (showTotal) modes.add(DataBaseConnector.StatusElement.TOTAL); + if (showLastComponent) modes.add(DataBaseConnector.StatusElement.LAST_COMPONENT); + if (modes.isEmpty()) + modes = EnumSet.allOf(DataBaseConnector.StatusElement.class); + + try (CoStoSysConnection ignored = dbc.obtainOrReserveConnection()) { + SubsetStatus status = dbc.status(subsetTableName, modes); + System.out.println(status); + } + } + } catch (TableSchemaDoesNotExistException e) { + LOG.error(e.getMessage()); + error = true; + } catch (TableNotFoundException e) { + LOG.error(e.getMessage()); + e.printStackTrace(); + } + return error; + } + + private static boolean doSubset(DataBaseConnector dbc, String subsetTableName, String fileStr, String queryStr, + String superTableName, String subsetJournalFileName, String subsetQuery, 
boolean mirrorSubset, + String whereClause, boolean all4Subset, String randomSubsetSize, Integer numberRefHops) + throws SQLException { + String comment = ""; + boolean error; + ArrayList ids = null; + String condition = null; + + error = checkSchema(dbc, subsetTableName); + if (!error) { + if (subsetJournalFileName != null) { + try { + ids = asList(subsetJournalFileName); + } catch (IOException e) { + e.printStackTrace(); + } + if (ids.size() == 0) { + LOG.error(subsetJournalFileName + " is empty."); + error = true; + } + StringBuilder sb = new StringBuilder(); + for (String id : ids) + sb.append(", ").append(id); + condition = Constants.NLM_ID_FIELD_NAME; + comment = "Subset created " + new Date().toString() + " by matching with " + superTableName + " on " + + condition + ": " + sb.substring(2); + } else if (subsetQuery != null) { + logMessage("Querying PubMed for: " + subsetQuery); + ids = QueryPubMed.query(subsetQuery); + if (ids.size() == 0) { + LOG.error("No results for your query."); + error = true; + } else + LOG.info("PubMed delivered " + ids.size() + " results."); + condition = Constants.PMID_FIELD_NAME; + comment = "Subset created " + new Date().toString() + " by matching with " + superTableName + + " on PubMed-query: " + subsetQuery; + } else if (all4Subset) { + logMessage("Creating subset by matching all entries from table " + superTableName + "."); + comment = "Subset created " + new Date().toString() + " by matching with " + superTableName; + } else if (whereClause != null) { + comment = "Subset created " + new Date().toString() + " by selecting rows from " + superTableName + + " with where clause \"" + whereClause + "\""; + } else if (randomSubsetSize != null) { + try { + new Integer(randomSubsetSize); + comment = "Subset created " + new Date().toString() + " by randomly selecting " + randomSubsetSize + + " rows from " + superTableName + "."; + } catch (NumberFormatException e) { + LOG.error(randomSubsetSize + " is not a number!"); + error = true; + 
} + } else if (fileStr != null) { + try { + ids = asList(fileStr); + } catch (IOException e) { + e.printStackTrace(); + } + if (ids.size() == 0) { + LOG.error(fileStr + " is empty."); + error = true; + } + condition = dbc.getFieldConfiguration(dbc.getActiveTableSchema()).getPrimaryKey()[0]; + comment = "Subset created " + new Date().toString() + " by matching with " + superTableName + " on " + + ids.size() + " " + condition + "s;"; + } else if (mirrorSubset) { + comment = "Subset created " + new Date().toString() + " as to mirror " + superTableName + ";"; + } else { + error = true; + LOG.error("You must choose a way to define the subset."); + } + + comment = escapeSingleQuotes(comment); + } + if (!dbc.withConnectionQueryBoolean(c -> dbc.tableExists(superTableName))) { + logMessage("Checking whether super table " + superTableName + " exists..."); + LOG.error("Table " + superTableName + " doesn't exist!"); + error = true; + } + if (!error) { + try (CoStoSysConnection connPair = dbc.obtainOrReserveConnection()) { + if (!dbc.tableExists(subsetTableName)) { + logMessage("No table with the name \"" + subsetTableName + "\" exists, creating new subset table..."); + dbc.createSubsetTable(subsetTableName, superTableName, numberRefHops, comment); + logMessage("Created table " + subsetTableName); + } else + LOG.error("Table " + subsetTableName + " allready exists."); + if (dbc.isEmpty(subsetTableName) && !error) { + if (all4Subset) + dbc.initSubset(subsetTableName, superTableName); + else if (whereClause != null) + dbc.initSubsetWithWhereClause(subsetTableName, superTableName, whereClause); + else if (ids != null && ids.size() > 0) + dbc.initSubset(ids, subsetTableName, superTableName, condition); + else if (mirrorSubset) + dbc.initMirrorSubset(subsetTableName, superTableName, true); + else if (randomSubsetSize != null) { + dbc.initRandomSubset(new Integer(randomSubsetSize), subsetTableName, superTableName); + } + logMessage("Subset defined."); + } else { + 
LOG.error(subsetTableName + " is not empty, please use another table."); + error = true; + } + } + } + return error; + } + + private static boolean doImportOrUpdate(DataBaseConnector dbc, String fileStr, String queryStr, + String superTableName, boolean updateMode) throws SQLException { + boolean error = false; + if (fileStr != null) { + + if (!dbc.withConnectionQueryBoolean(c -> c.tableExists(superTableName))) { + error = checkSchema(dbc, superTableName); + final String comment = "Data table created " + new Date().toString() + " by importing data from path " + fileStr; + if (!error) { + dbc.withConnectionExecute(c -> c.createTable(superTableName, comment)); + logMessage("Created table " + superTableName); + + } + } + + if (dbc.withConnectionQueryBoolean(c -> c.isEmpty(superTableName)) && !updateMode) { + dbc.withConnectionExecute(c -> c.importFromXMLFile(fileStr, superTableName)); + } else { + logMessage("Table is not empty or update mode was explicitly specified, processing Updates."); + dbc.withConnectionExecute(c -> c.updateFromXML(fileStr, superTableName)); + logMessage("Updates finished."); + } + } else { + LOG.error("You must specify a file or directory to retrieve XML files from."); + error = true; + } + return error; + } + + private static boolean doQuery(DataBaseConnector dbc, QueryOptions qo) { + boolean error = false; + + /** + * The document IDs that should be returned (optional) + */ + String queryStr = qo.queryStr; + String fileStr = qo.fileStr; + String tableName = qo.tableName; + String tableSchema = qo.tableSchema; + boolean useDelimiter = qo.useDelimiter; + boolean pubmedArticleSet = qo.pubmedArticleSet; + String xpath = qo.xpath; + // this could be a directory or file name, depending on parameters + String baseOutFile = qo.baseOutDirStr; + String batchSizeStr = qo.batchSizeStr; + String limitStr = qo.limitStr; + Integer numberRefHops = qo.numberRefHops; + + // In the following algorithm, first of all each possible + // parameter/resource is 
acquired. Further down is then one single + // algorithm iterating over queried documents and treating them + // accordingly to the parameters which have been found. + File outfile = null; + int batchSize = 0; + BufferedWriter bw = null; + boolean keysExplicitlyGiven = fileStr != null || queryStr != null; + long limit = limitStr != null ? Integer.parseInt(limitStr) : -1; + + boolean createDirectory = baseOutFile != null && !pubmedArticleSet; + if (verbose) { + logMessage("Creating " + (createDirectory ? "directory" : "file") + " " + baseOutFile + + " to write query results to."); + } + + if (createDirectory) { + outfile = new File(baseOutFile); + if (!outfile.exists()) { + logMessage("Directory " + outfile.getAbsolutePath() + + " does not exist and will be created (as well as sub dircetories for file batches if required)."); + outfile.mkdir(); + } + logMessage("Writing queried documents to " + outfile.getAbsolutePath()); + + if (batchSizeStr != null) { + try { + batchSize = Integer.parseInt(batchSizeStr); + logMessage("Dividing query result files in batches of " + batchSize); + if (batchSize < 1) + throw new NumberFormatException(); + } catch (NumberFormatException e) { + LOG.error( + "Error parsing \"{}\" into an integer. Please deliver a positive numeric value for the batch size of files."); + } + } + } + + if (!error) { + List keys = new ArrayList(); + if (fileStr != null) { + try { + keys = asListOfArrays(fileStr); + } catch (IOException e1) { + LOG.error("Could not open '" + new File(fileStr).getAbsolutePath() + "'."); + error = true; + } + } + if (queryStr != null) { + for (String pmid : queryStr.split(",")) + keys.add(pmid.split(KEY_PART_SEPERATOR)); + } + + // Main algorithm iterating over documents. 
+ try { + if (!error) { + Iterator it; + if (!keysExplicitlyGiven) { + it = dbc.querySubset(tableName, qo.whereClause, limit, numberRefHops, tableSchema); + } else if (keys.size() > 0) + it = dbc.retrieveColumnsByTableSchema(keys, tableName, tableSchema); + else + throw new IllegalStateException( + "No query keys have been explicitly given (e.g. in a file) nor should the whole table be queried."); + int i = 0; + // The name of the sub directories will just be their batch + // number. We start at -1 because the batchNumber will be + // incremented first of all (0 % x == 0, Ax). + int batchNumber = -1; + // outDir will be baseOutDir plus the current batch number + // of files when + // saving the queried files in separate batches is wished. + File outDir = outfile; + + if (pubmedArticleSet) { + if (null != baseOutFile) { + logMessage( + "Creating a single file with a PubmedArticleSet and writing it to " + baseOutFile); + bw = new BufferedWriter(new FileWriter(baseOutFile)); + } + print("\n" + + "\n" + + "", bw); + } + + while (it.hasNext()) { + byte[][] idAndXML = it.next(); + if (outfile != null) { + // if we want batches, create appropriate + // subdirectories + if (batchSize > 0 && i % batchSize == 0) { + ++batchNumber; + // Adjust the sub directory for the new batch. + String subDirectoryName = (batchNumber > -1 && batchSize > 0 + ? 
Integer.toString(batchNumber) : ""); + String subDirPath = outfile.getAbsolutePath() + FILE_SEPERATOR + subDirectoryName; + outDir = new File(subDirPath); + outDir.mkdir(); + } + + // Write the current file into the given directory + // and use the key as file name + String filename = new String(idAndXML[0]); + + if (!pubmedArticleSet) { + if (bw != null) + bw.close(); + bw = new BufferedWriter(new FileWriter(outDir + FILE_SEPERATOR + filename)); + } + } + if (xpath == null) { + StringBuilder sb = new StringBuilder(); + if (pubmedArticleSet) + sb.append("\n"); + sb.append(new String(idAndXML[1], "UTF-8")); + if (pubmedArticleSet) + sb.append("\n"); + print(sb.toString(), bw); + } else { + // 'values' contains for each XPath delivered one + // array of Strings holding the values for this + // XPath (e.g. the AuthorList mostly yields several + // values). + String[][] values = getXpathValues(idAndXML[1], xpath); + for (String[] valuesOfXpath : values) + for (String singleValue : valuesOfXpath) + print(singleValue, bw); + } + if (useDelimiter) + System.out.println(DELIMITER); + ++i; + + } + + if (pubmedArticleSet) { + print("", bw); + } + } + } catch (IOException e) { + e.printStackTrace(); + } catch (SQLException e) { + e.printStackTrace(); + } finally { + try { + if (bw != null) + bw.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + + return error; + } + + /** + * @param string + * @param bw + * @throws IOException + */ + private static void print(String string, BufferedWriter bw) throws IOException { + if (bw == null) + System.out.println(string); + else + bw.write(string + "\n"); + } + + private static String[][] getXpathValues(byte[] next, String xpaths) { + + String[] xpathArray = xpaths.split(","); + List> fields = new ArrayList>(); + for (String xpath : xpathArray) { + Map field = new HashMap(); + field.put(JulieXMLConstants.NAME, xpath); + field.put(JulieXMLConstants.XPATH, xpath); + field.put(JulieXMLConstants.RETURN_XML_FRAGMENT, 
"true"); + field.put(JulieXMLConstants.RETURN_ARRAY, "true"); + fields.add(field); + } + + String[][] retStrings = new String[xpathArray.length][]; + + Iterator> it = JulieXMLTools.constructRowIterator(next, 1024, ".", fields, "your result"); + if (it.hasNext()) { + Map row = it.next(); + for (int i = 0; i < xpathArray.length; i++) { + // Get the field "xpath" which was given as field name above; we + // wanted multiple results to be returned in an array. + String[] values = (String[]) row.get(xpathArray[i]); + if (values == null) + values = new String[]{"XPath " + xpaths + " does not exist in this document."}; + retStrings[i] = values; + } + if (it.hasNext()) { + // What happened? We wanted all values in one array, so this + // should not happen. + LOG.warn( + "There are more results for the XPath {} then expected and not all have been returned. Please contact a developer for help.", + xpaths); + } + } + return retStrings; + } + + private static void printHelp(Options options) { + HelpFormatter formatter = new HelpFormatter(); + formatter.setWidth(160); + formatter.printHelp(CLI.class.getName(), options); + } + + private static Options getOptions() { + Options options = new Options(); + + // -------------------- OptionGroup for available modes -------------- + OptionGroup modes = new OptionGroup(); + + modes.addOption(buildOption("i", "import", "Import data into the _data table", "file/dir to import")); + modes.addOption(buildOption("im", "importmedline", "Import PubMed/MEDLINE data into the _data table. The parameter is a XML file holding information about the PubMed/MEDLINE baseline location. It is the same file format used for the -um mode.", "XML MEDLINE configuration")); + modes.addOption(buildOption("u", "update", "Update _data table", "file/dir to update from")); + modes.addOption(buildOption("um", "updatemedline", "Update _data table from PubMed/MEDLINE update files. Keeps track of already applied update files via an internal table. 
The parameter is a XML file holding information about the update file location. It is the same file format used for the -im mode.", "XML MEDLINE configuration")); + modes.addOption(buildOption("s", "subset", + "Define a subset table; use -f, -o, -a, -m, -w or -r to specify the subsets source.", + "name of the new subset table")); + modes.addOption(buildOption("re", "reset", + "Resets a subset table to a not-yet-processed state. Flags:\n" + "-np only reset non-processed items\n" + + "-ne only reset items without errors\n" + + "-lc to reset only those items with the given last component\n" + + "-f a partial reset can be achieved by specifying a file containing one primary key value for each document to be resetted", + "subset table name")); + modes.addOption( + buildOption("st", "status", "Show the processing status of a subset table. Generates a small report containing the number of processed and total documents of a subset table. " + + "The report can be customized using the -he, -isp, -inp, -to and -slc switches", "subset table name")); + + OptionBuilder.withLongOpt("query"); + OptionBuilder.withDescription("Query a table (default: " + Constants.DEFAULT_DATA_TABLE_NAME + + ") for XMLs. You can enter the primary keys directly or use -f to specify a file. If you define none of both, the whole table will be returned.\n" + + "Use -d to display delimiters between the results.\n" + + "Use -z to specify the target table. 
If the table is a subset, only documents in this subset will be returned.\n" + + "Use -l to set a limit of returned documents.\n" + + "Use -x to specify an XPath expression go extract specific parts of the queried XML documents.\n" + + "Use -out to save the query results to file."); + OptionBuilder.hasOptionalArg(); + OptionBuilder.withArgName("your query"); + modes.addOption(OptionBuilder.create("q")); + + modes.addOption(buildOption("h", "help", "Displays all possible parameters.")); + modes.addOption(buildOption("t", "tables", "Displays all tables in the active scheme.")); + + modes.addOption(buildOption("td", "tabledefinition", "Displays the columns of a table.", "the table")); + + modes.addOption(buildOption("ds", "displayscheme", "Displays the active scheme.")); + modes.addOption(buildOption("ch", "check", + "Checks if a table confirms to its definition (for subsets: only primary keys!)", "table")); + modes.addOption(buildOption("dc", "defaultconfig", "Prints the defaultConfiguration.")); + modes.addOption(buildOption("dt", "droptable", "Drops the given table.", "table")); + + modes.addOption(buildOption("lts", "listtableschemas", + "Displays all table schema names in the configuration. The showed name index can be used as value for the -ts option.")); + + modes.setRequired(true); + + options.addOptionGroup(modes); + + // -------------------- OptionGroup for exclusive parameters-------------- + OptionGroup exclusive = new OptionGroup(); + + exclusive.addOption(buildOption("f", "file", + "Set the file used for query, subset creation or partial subset reset.", "file")); + exclusive.addOption(buildOption("o", "online", + "Defines the subset by a PubMed query - remember to wrap it in double quotation marks!", "query")); + exclusive.addOption(buildOption("a", "all", "Use all entries of the _data table for the subset.")); + exclusive.addOption(buildOption("r", "random", + "Generates a random subset, you must provide its size as a parameter. 
Often used with -z.", "size")); + exclusive.addOption(buildOption("m", "mirror", + "Creates a subset table which mirrors the database table. I.e. when the data table gets new records, the mirror subset(s) will be updated accordingly.")); + exclusive + .addOption(buildOption("w", "where", "Uses a SQL WHERE clause during subset definition.", "condition")); + exclusive.addOption( + buildOption("j", "journals", "Define a subset by providing a file with journal names.", "file")); + exclusive.addOption( + buildOption("l", "limit", "For use with -q. Restricts the number of documents returned.", "limit")); + + + options.addOption(buildOption("he", "has errors", + "Flag for -st(atus) mode to add the 'has errors' statistic to a subset status report.")); + options.addOption(buildOption("isp", "is processed", + "Flag for -st(atus) mode to add the 'is processed' statistic to a subset status report.")); + options.addOption(buildOption("inp", "is in process", + "Flag for -st(atus) mode to add the 'is in process' statistic to a subset status report.")); + options.addOption(buildOption("to", "total", + "Flag for -st(atus) mode to add the 'total' statistic to a subset status report.")); + options.addOption(buildOption("slc", "show last component", + "Flag for -st(atus) mode to add the 'last component' statistic to a subset status report.")); + + + options.addOption(buildOption("np", "not processed", + "Flag for -re(set) mode to restrict to non-processed table rows. May be combined with -ne, -lc.")); + options.addOption(buildOption("ne", "no errors", + "Flag for -re(set) mode to restrict to table rows without errors. May be combined with -np, -lc.")); + options.addOption(buildOption("lc", "last component", + "Option for -re(set) mode to restrict to table rows to a given last component identifier. 
May be combined with -np, -ne.", + "component name")); + + options.addOptionGroup(exclusive); + + // --------------- optional details for many modes -------------- + options.addOption(buildOption("z", "superset", + "Provides a superset name for definition of a subset or the name of a data table.", + "name of the superset table")); + options.addOption(buildOption("v", "verbose", "Activate verbose informational ouput of the tool's actions")); + + options.addOption(buildOption("d", "delimiter", "Display a line of \"-\" as delimiter between the results.")); + options.addOption(buildOption("pas", "pubmedarticleset", + "For use with -q. The queried documents will be interpreted as Medline XML documents and will be enclosed in PubmedArticleSet.")); + options.addOption(buildOption("out", "out", + "The file or directory where query results are written to. By default, a directory will be created and it will be filled with one file per document. The files will have the name of their database primary key. Modifying parameters:\n" + + "Use -bs to create subdirectories for batches of files.\n" + + "Use -pas to create no directory but a single XML file representing a PubmedArticleSet. This assumes that the queried documents are Medline or Pubmed XML documents.", + "output directory")); + options.addOption(buildOption("bs", "batchsize", + "The number of queried documents (by -q and -out) which should be written in one directory. Subdirectories will be created at need.", + "batchsize")); + options.addOption(buildOption("x", "xpath", + "When querying documents using -q, you may specify one or more XPath expressions to restrict the output to the elements referenced by your XPath expressions. 
Several XPaths must be delimited by a single comma.", + "xpath")); + options.addOption(buildOption("rh", "referencehops", + "The maximum number of allowed hops to tables referenced with a foreign key when creating subset tables.", + "max number of hops")); + options.addOption(buildOption("ts", "tableschema", + "Table Schema to use; currently only supported by -q mode. The name can be given or the index as retrieved by the -lts mode.", + "schemaname")); + + // -------------------- authentication -------------------- + options.addOption(buildOption("U", "url", + "URL to database server (e.g. jdbc:postgresql:///)", "url")); + options.addOption(buildOption("n", "username", "username for database", "username")); + options.addOption(buildOption("p", "pass", "password for database", "password")); + options.addOption(buildOption("pgs", "pgschema", "Postgres Schema to use", "schema")); + options.addOption(buildOption("srv", "server", "Server name to connect to", "servername")); + options.addOption(buildOption("db", "database", "Database to connect to", "database")); + options.addOption(buildOption("dbc", "databaseconfiguration", + "XML file specifying the user configuration (defaults to dbcConfiguration.xml).", "Config File")); + + return options; + } + + private static Option buildOption(String shortName, String longName, String description, String... 
arguments) { + OptionBuilder.withLongOpt(longName); + OptionBuilder.withDescription(description); + OptionBuilder.hasArgs(arguments.length); + for (String argument : arguments) + OptionBuilder.withArgName(argument); + return OptionBuilder.create(shortName); + } + + /** + * @param dbc - databaseconnector + * @param tableName - name of the table to check + * @return true - if there was an error, otherwise false + */ + private static boolean checkSchema(DataBaseConnector dbc, String tableName) { + boolean error = false; + String[] tablePath = tableName.split("\\."); + // if the table name has the form 'schemaname.tablename' + if (tablePath.length == 2 && !dbc.withConnectionQueryBoolean(c -> c.schemaExists(tablePath[0]))) + dbc.createSchema(tablePath[0]); + else if (tablePath.length > 2) { + LOG.error(String.format( + "The table path %s is invalid. Only table names of the form 'tablename' or 'schemaname.tablename'are accepted.", + tableName)); + + } + return error; + } + + private static String escapeSingleQuotes(String comment) { + return comment.replaceAll("'", "\\\\'"); + + } + + private static List asListOfArrays(String fileStr) throws IOException { + List list = new ArrayList(); + File file = new File(fileStr); + if (file != null) { + try (BufferedReader br = new BufferedReader(new FileReader(file))) { + String line = br.readLine(); + while (line != null) { + list.add(line.split(KEY_PART_SEPERATOR)); + line = br.readLine(); + } + } + } + return list; + } + + private static ArrayList asList(String fileStr) throws IOException { + ArrayList list = new ArrayList(); + File file = new File(fileStr); + if (file != null) { + try (BufferedReader br = new BufferedReader(new FileReader(file))) { + String line = br.readLine(); + while (line != null) { + list.add(line); + line = br.readLine(); + } + } + } + return list; + } + + private enum Mode { + IMPORT, QUERY, SUBSET, RESET, STATUS, ERROR, TABLES, LIST_TABLE_SCHEMAS, TABLE_DEFINITION, SCHEME, CHECK, DEFAULT_CONFIG, 
DROP_TABLE, UPDATE_MEDLINE + } + + public static XMLConfiguration loadXmlConfiguration(File configurationFile) throws ConfigurationException { + try { + Parameters params = new Parameters(); + FileBasedConfigurationBuilder configBuilder = + new FileBasedConfigurationBuilder<>(XMLConfiguration.class).configure(params + .xml() + .setFile(configurationFile)); + return configBuilder.getConfiguration(); + } catch (org.apache.commons.configuration2.ex.ConfigurationException e) { + throw new ConfigurationException(e); + } + } +} diff --git a/src/main/java/de/julielab/costosys/cli/ConfigurationNotFoundException.java b/src/main/java/de/julielab/costosys/cli/ConfigurationNotFoundException.java new file mode 100644 index 0000000..044fc4a --- /dev/null +++ b/src/main/java/de/julielab/costosys/cli/ConfigurationNotFoundException.java @@ -0,0 +1,22 @@ +package de.julielab.costosys.cli; + +public class ConfigurationNotFoundException extends Exception { + public ConfigurationNotFoundException() { + } + + public ConfigurationNotFoundException(String message) { + super(message); + } + + public ConfigurationNotFoundException(String message, Throwable cause) { + super(message, cause); + } + + public ConfigurationNotFoundException(Throwable cause) { + super(cause); + } + + public ConfigurationNotFoundException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } +} diff --git a/src/main/java/de/julielab/costosys/cli/ExtractDeleteCitations.java b/src/main/java/de/julielab/costosys/cli/ExtractDeleteCitations.java new file mode 100644 index 0000000..0b77a2c --- /dev/null +++ b/src/main/java/de/julielab/costosys/cli/ExtractDeleteCitations.java @@ -0,0 +1,85 @@ +/** + * ExtractDeleteCitations.java + * + * Copyright (c) 2010, JULIE Lab. + * All rights reserved. 
This program and the accompanying materials + * are made available under the terms of the Common Public License v1.0 + * + * Author: chew + * + * Current version: 1.0 + * Since version: 1.0 + * + * Creation date: 14.12.2010 + **/ + +package de.julielab.costosys.cli; + +import java.io.File; +import java.io.FilenameFilter; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import de.julielab.costosys.Constants; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import de.julielab.xml.JulieXMLConstants; +import de.julielab.xml.JulieXMLTools; + +/** + * Extracts PMIDs of deleted Medline documents from Medline Update XML batches. + * Currently the path to the XML files is hard coded, this should be made more + * flexible. + * + * @author faessler + */ +public class ExtractDeleteCitations { + + private static final Logger LOG = LoggerFactory + .getLogger(ExtractDeleteCitations.class); + + public static void main(String[] args) { + extractDeletedPMIDs(); + } + + private static void extractDeletedPMIDs() { + LOG.info("Starting extraction..."); + File baseDir = new File("/data/data_corpora/medline/updates"); + if (!baseDir.isDirectory()) { + LOG.error(String.format( + "Path %s does not point to a directory.", + baseDir.getAbsolutePath())); + System.exit(1); + } + String[] fileNames = baseDir.list(new FilenameFilter() { + public boolean accept(File arg0, String arg1) { + return arg1.endsWith(".gz"); + } + }); + + String forEachXpath = "/MedlineCitationSet/DeleteCitation/PMID"; + List> fields = new ArrayList>(); + Map field = new HashMap(); + field.put(JulieXMLConstants.NAME, Constants.PMID_FIELD_NAME); + field.put(JulieXMLConstants.XPATH, + "/MedlineCitationSet/DeleteCitation/PMID"); + fields.add(field); + + int bufferSize = 1000; + for (String fileName : fileNames) { + Iterator> it = JulieXMLTools.constructRowIterator( + baseDir.getAbsolutePath() + "/" + fileName, bufferSize, + 
forEachXpath, fields, false); + + while (it.hasNext()) { + Map row = it.next(); + String pmid = (String) row.get(Constants.PMID_FIELD_NAME); + System.out.println(pmid); + } + } + } +} diff --git a/src/main/java/de/julielab/costosys/cli/QueryOptions.java b/src/main/java/de/julielab/costosys/cli/QueryOptions.java new file mode 100644 index 0000000..85b33c4 --- /dev/null +++ b/src/main/java/de/julielab/costosys/cli/QueryOptions.java @@ -0,0 +1,26 @@ +package de.julielab.costosys.cli; + +public class QueryOptions { + + public String queryStr; + public String fileStr; + public String tableName; + public boolean useDelimiter; + public String xpath; + public String baseOutDirStr; + public String batchSizeStr; + public String limitStr; + public String whereClause; + public Integer numberRefHops; + public boolean pubmedArticleSet; + public String tableSchema; + @Override + public String toString() { + return "QueryOptions [queryStr=" + queryStr + ", fileStr=" + fileStr + ", tableName=" + tableName + + ", useDelimiter=" + useDelimiter + ", xpath=" + xpath + ", baseOutDirStr=" + baseOutDirStr + + ", batchSizeStr=" + batchSizeStr + ", limitStr=" + limitStr + ", whereClause=" + whereClause + + ", numberRefHops=" + numberRefHops + ", pubmedArticleSet=" + pubmedArticleSet + "]"; + } + + +} diff --git a/src/main/java/de/julielab/costosys/cli/QueryPubMed.java b/src/main/java/de/julielab/costosys/cli/QueryPubMed.java new file mode 100644 index 0000000..6b5b452 --- /dev/null +++ b/src/main/java/de/julielab/costosys/cli/QueryPubMed.java @@ -0,0 +1,85 @@ +package de.julielab.costosys.cli; + +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.net.URLEncoder; +import java.util.ArrayList; + +import com.ximpleware.AutoPilot; +import com.ximpleware.EOFException; +import com.ximpleware.EncodingException; +import com.ximpleware.EntityException; +import com.ximpleware.NavException; +import com.ximpleware.ParseException; +import com.ximpleware.VTDGen; 
+import com.ximpleware.VTDNav; +import com.ximpleware.XPathEvalException; +import com.ximpleware.XPathParseException; + +import de.julielab.xml.JulieXMLTools; + +public class QueryPubMed { + private final static String SITE = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"; + private final static String RETMAX = "100000000"; // 5x the size of PubMed (2011) + private final static int BUFFERSIZE = 1024; + private final static String XPATH = "/eSearchResult/IdList/Id"; + + /** + * Query PubMed via REST-API, returning up to 10e8 matched PMIDs. + * Queried terms get expanded, e.g. "Il-1" will match "interleukin-1". + * Searches with really many results (e.g. "cancer") need increased heap space! + * More details: http://eutils.ncbi.nlm.nih.gov/corehtml/query/static/esearch_help.html + * + * @param query -Query for PubMed as a String + * @return - ArrayList, containing PMIDs as Strings + */ + public static ArrayList query(String query) { + ArrayList ids = new ArrayList(); + try { + StringBuilder queryBuilder = new StringBuilder(); + queryBuilder.append(SITE) + .append("?term=").append(URLEncoder.encode(query, "UTF-8")) + .append("&retmax=").append(RETMAX).append("&tool=julie-medline-manager") + .append("&email=julielab@listserv.uni-jena.de"); + URL url = new URL(queryBuilder.toString()); + InputStream stream = url.openStream(); + + VTDGen vg = new VTDGen(); // Parses XML + vg.setDoc(JulieXMLTools.readStream(stream, BUFFERSIZE)); + vg.parse(true); + VTDNav vn = vg.getNav(); // Navigates in parsed XML + AutoPilot ap = new AutoPilot(vn); // Moves through whole XML + + ap.selectXPath(XPATH); + while (ap.evalXPath() != -1) { + // 32 bits encoding length, 32 bits encoding offset + long fragment = vn.getContentFragment(); + // right 32 bits + int offset = (int) fragment; + // left 32 bits, casts priority is higher than right-shifts + int length = (int) (fragment >> 32); + ids.add(vn.toString(offset, length)); + } + + } catch (IOException e) { + e.printStackTrace(); + 
} catch (EncodingException e) { + e.printStackTrace(); + } catch (EOFException e) { + e.printStackTrace(); + } catch (EntityException e) { + e.printStackTrace(); + } catch (ParseException e) { + e.printStackTrace(); + } catch (XPathParseException e) { + e.printStackTrace(); + } catch (XPathEvalException e) { + e.printStackTrace(); + } catch (NavException e) { + e.printStackTrace(); + } + + return ids; + } +} \ No newline at end of file diff --git a/src/main/java/de/julielab/costosys/cli/TableNotFoundException.java b/src/main/java/de/julielab/costosys/cli/TableNotFoundException.java new file mode 100644 index 0000000..d62fc2d --- /dev/null +++ b/src/main/java/de/julielab/costosys/cli/TableNotFoundException.java @@ -0,0 +1,16 @@ +package de.julielab.costosys.cli; + +public class TableNotFoundException extends Exception { + /** + * + */ + private static final long serialVersionUID = -4868103716490551991L; + + public TableNotFoundException() { + super(); + } + + public TableNotFoundException(String message) { + super(message); + } +} diff --git a/src/main/java/de/julielab/costosys/cli/Test.java b/src/main/java/de/julielab/costosys/cli/Test.java new file mode 100644 index 0000000..57cf2a3 --- /dev/null +++ b/src/main/java/de/julielab/costosys/cli/Test.java @@ -0,0 +1,20 @@ +package de.julielab.costosys.cli; + +import java.io.*; +import java.nio.file.FileSystem; +import java.nio.file.FileSystems; +import java.nio.file.Paths; +import java.nio.file.StandardOpenOption; + +public class Test { + public static void main(String args[]) throws IOException { + try (FileSystem fs = FileSystems.newFileSystem(Paths.get("myfs.zip"), null)) { + + OutputStream os = fs.provider().newOutputStream(fs.getPath("mow/entry.txt"), StandardOpenOption.CREATE, StandardOpenOption.WRITE); + try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(os))) { + bw.write("Here is content!"); + } + } + System.out.println("Done"); + } +} diff --git 
a/src/main/java/de/julielab/xmlData/config/ConfigBase.java b/src/main/java/de/julielab/costosys/configuration/ConfigBase.java similarity index 97% rename from src/main/java/de/julielab/xmlData/config/ConfigBase.java rename to src/main/java/de/julielab/costosys/configuration/ConfigBase.java index ccffaf4..02c0148 100644 --- a/src/main/java/de/julielab/xmlData/config/ConfigBase.java +++ b/src/main/java/de/julielab/costosys/configuration/ConfigBase.java @@ -1,4 +1,4 @@ -package de.julielab.xmlData.config; +package de.julielab.costosys.configuration; import java.io.IOException; @@ -11,7 +11,7 @@ import com.ximpleware.VTDNav; import com.ximpleware.XPathParseException; -import de.julielab.xmlData.dataBase.DataBaseConnector; +import de.julielab.costosys.dbconnection.DataBaseConnector; diff --git a/src/main/java/de/julielab/costosys/configuration/ConfigReader.java b/src/main/java/de/julielab/costosys/configuration/ConfigReader.java new file mode 100644 index 0000000..3c88dd1 --- /dev/null +++ b/src/main/java/de/julielab/costosys/configuration/ConfigReader.java @@ -0,0 +1,452 @@ +/** + * ConfigurationParser.java + * + * Copyright (c) 2011, JULIE Lab. + * All rights reserved. 
This program and the accompanying materials + * are made available under the terms of the Common Public License v1.0 + * + * Author: faessler/hellrich + * + * Current version: 1.0 + * Since version: 1.0 + * + * Creation date: 22.03.2011 + **/ + +package de.julielab.costosys.configuration; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.*; + +import org.apache.commons.io.IOUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.ximpleware.AutoPilot; +import com.ximpleware.NavException; +import com.ximpleware.VTDException; +import com.ximpleware.VTDGen; +import com.ximpleware.VTDNav; +import com.ximpleware.XMLModifier; +import com.ximpleware.XPathEvalException; +import com.ximpleware.XPathParseException; + +import de.julielab.xml.JulieXMLTools; + +/** + * This class reads an xml configuration file, containing the definition of a + * database connection and the fields used in the database. It provides those + * definitions as specialized objects. 
+ * + * @author hellrich + */ +public class ConfigReader { + + private static final Logger LOG = LoggerFactory + .getLogger(ConfigReader.class); + private final static int BUFFER_SIZE = 1000; + public static final String DEFAULT_DEFINITION = "/defaultConfiguration.xml"; + + public static final String XPATH_ACTIVE_TABLE_SCHEMA = "//activeTableSchema"; + public static final String XPATH_ACTIVE_DB = "//activeDBConnection"; + public static final String XPATH_ACTIVE_PG_SCHEMA = "//activePostgresSchema"; + public static final String XPATH_MAX_CONNS = "//maxActiveDBConnections"; + + public static final String XPATH_CONF_DBS = "//DBConnections"; + public static final String XPATH_CONF_SCHEMAS = "//tableSchemas"; + + public static final String XPATH_CONF_DB = "//DBConnection"; + public static final String XPATH_CONF_SCHEMA = "//tableSchema"; + + private static final int INDEX_SCHEMA = 0; + private static final int INDEX_DB = 1; + private static final int INDEX_PG_SCHEMA = 2; + private static final int INDEX_MAX_CONNS = 3; + private static final int INDEX_DATA_TABLE = 4; + private static final int INDEX_DATA_SCHEMA = 5; + + private static final String XPATH_DATA_TABLE = "//dataTable"; + private static final String XPATH_DATA_SCHEMA = "//activeDataPostgresSchema"; + private static final String ATTRIBUTE_NAME = "name"; + + private FieldConfigurationManager fieldConfigs; + private DBConfig dbConf; + private String activeDataTable; + private String activeSchemaName; + private byte[] mergedConfigData; + private String activeDataSchema; + private List schemaNames; + + public ConfigReader(InputStream def) { + try { + byte[] defaultConfData = null; + byte[] userConfData = null; + + InputStream is = getClass().getResourceAsStream(DEFAULT_DEFINITION); + defaultConfData = IOUtils.toByteArray(is); + is.close(); + // check if the user gave a table schema definition; + // if not, the default values will be used + if (def != null) { + userConfData = IOUtils.toByteArray(def); + def.close(); 
+ } + mergedConfigData = mergeConfigData(defaultConfData, userConfData); + + schemaNames = getAllSchemaNames(mergedConfigData); + + // Creating + fieldConfigs = new FieldConfigurationManager(); + for (String schemaName : schemaNames) + fieldConfigs.put(schemaName, new FieldConfig(mergedConfigData, + schemaName)); + dbConf = new DBConfig(mergedConfigData); + activeDataTable = ConfigBase.getActiveConfig(mergedConfigData, + XPATH_DATA_TABLE); + activeDataSchema = ConfigBase.getActiveConfig(mergedConfigData, + XPATH_DATA_SCHEMA); + activeSchemaName = ConfigBase.getActiveConfig(mergedConfigData, + XPATH_ACTIVE_TABLE_SCHEMA); + + LOG.debug("Active data table: {}", activeDataTable); + LOG.debug("Active Postgres data schema: {}", activeDataSchema); + LOG.debug("Active table schema: {}", activeSchemaName); + } catch (IOException e) { + e.printStackTrace(); + } catch (VTDException e) { + LOG.error("Parsing of configuration file failed:", e); + } + + } + + /** + * @param mergedConfigData + * @return + */ + private List getAllSchemaNames(byte[] mergedConfigData) + throws VTDException { + List schemaNames = new ArrayList(); + + VTDGen vg = new VTDGen(); + vg.setDoc(mergedConfigData); + vg.parse(true); + VTDNav vn = vg.getNav(); + // Navigates through schema elements + AutoPilot schemaAP = new AutoPilot(vn); + schemaAP.selectXPath(XPATH_CONF_SCHEMA); + // Returns the name attribute value for current schema, navigated to by + // schemaAP. + AutoPilot schemaNameAP = new AutoPilot(vn); + schemaNameAP.selectXPath("@name"); + + while (schemaAP.evalXPath() != -1) + schemaNames.add(schemaNameAP.evalXPathToString()); + + return schemaNames; + } + + /** + * Inserts the user schemes into the default configuration. This makes all + * data available in one place, which is useful for referencing default + * values from within a user configuration. 
+ * + * @param defaultConfData + * - prepared default configuration file + * @param userConfData + * - prepared user specific configuration file + * @return - the merged configuration + * @throws VTDException + * @throws IOException + */ + protected static byte[] mergeConfigData(byte[] defaultConfData, + byte[] userConfData) throws VTDException, IOException { + VTDGen vg = new VTDGen(); + vg.setDoc(defaultConfData); + vg.parse(true); + VTDNav vn = vg.getNav(); + AutoPilot ap = new AutoPilot(vn); + + if (userConfData == null) { + return defaultConfData; + //throw new IllegalArgumentException("No CoStoSys user configuration was passed."); + } + + XMLModifier xm = new XMLModifier(vn); + + // Get user defined schema and DB connection data. + byte[][] userDefs = extractConfigData(userConfData); + + // Add schema data to the default configuration. + if (userDefs[INDEX_SCHEMA] != null) { + ap.selectXPath(XPATH_CONF_SCHEMA); + if (ap.evalXPath() != -1) { + xm.insertAfterElement(userDefs[INDEX_SCHEMA]); + } + } + + // Add DB connection data to the default configuration. + if (userDefs[INDEX_DB] != null) { + ap.selectXPath(XPATH_CONF_DB); + if (ap.evalXPath() != -1) { + xm.insertAfterElement(userDefs[INDEX_DB]); + } + } + + // Get active table schema, active data postgres schema, active DB + // connection and more + // from the user configuration, if these declarations exist. + String[] activeConfs = getActiveConfigurations(userConfData); + LOG.debug("Found the following active configurations in the user data: {}", Arrays.toString(activeConfs)); + + // Insert the active configurations into the merged configuration, thus + // overwriting the defaults. + if (activeConfs[INDEX_SCHEMA].length() > 0) { + int newTextIndex = JulieXMLTools.setElementText(vn, ap, xm, + XPATH_ACTIVE_TABLE_SCHEMA, activeConfs[INDEX_SCHEMA]); + LOG.trace("Set the active table schema to {}. 
Returned new index: {}", activeConfs[INDEX_SCHEMA], newTextIndex);
+ if (newTextIndex == -1) {
+ throw new IllegalStateException(
+ "There is no active table schema defined. Please define an active table schema in your user "
+ + "configuration. The user configuration is: " + new String(userConfData, StandardCharsets.UTF_8));
+ }
+ }
+ if (activeConfs[INDEX_DB].length() > 0) {
+ int newTextIndex = JulieXMLTools.setElementText(vn, ap, xm,
+ XPATH_ACTIVE_DB, activeConfs[INDEX_DB]);
+ if (newTextIndex == -1) {
+// throw new IllegalStateException(
+// "Unexpected error: The default configuration does not define an active database connection. Please define an active DB connection in your user configuration.");
+ LOG.warn("The default configuration does not define an active database connection.");
+ }
+
+ }
+ if (activeConfs[INDEX_PG_SCHEMA].length() > 0) {
+ int newTextIndex = JulieXMLTools.setElementText(vn, ap, xm,
+ XPATH_ACTIVE_PG_SCHEMA, activeConfs[INDEX_PG_SCHEMA]);
+ if (newTextIndex == -1)
+ throw new IllegalStateException(
+ "Unexpected error: The default configuration does not define an active Postgres schema. Please define an active Postgres schema in your user configuration.");
+ }
+ if (activeConfs[INDEX_MAX_CONNS].length() > 0) {
+ int newTextIndex = JulieXMLTools.setElementText(vn, ap, xm,
+ XPATH_MAX_CONNS, activeConfs[INDEX_MAX_CONNS]);
+ if (newTextIndex == -1) {
+// throw new IllegalStateException(
+// "Unexpected error: The default configuration does not define a maximal number of database connections.
Please define a maximal number of connections in your user configuration.");
+ LOG.warn("Unexpected error: The default configuration does not define a maximal number of database connections");
+ }
+ }
+ if (activeConfs[INDEX_DATA_TABLE].length() > 0) {
+ int newTextIndex = JulieXMLTools.setElementText(vn, ap, xm,
+ XPATH_DATA_TABLE, activeConfs[INDEX_DATA_TABLE]);
+ if (newTextIndex == -1)
+ throw new IllegalStateException(
+ "Unexpected error: The default configuration does not define a _data table. Please define a _data table in your user configuration.");
+ }
+
+ if (activeConfs[INDEX_DATA_SCHEMA].length() > 0) {
+ int newTextIndex = JulieXMLTools.setElementText(vn, ap, xm,
+ XPATH_DATA_SCHEMA, activeConfs[INDEX_DATA_SCHEMA]);
+ if (newTextIndex == -1)
+ throw new IllegalStateException(
+ "Unexpected error: The default configuration does not define an active data Postgres schema. Please define a data postgres schema in your user configuration.");
+ }
+
+ // Test validity of merged xml (no doublets)
+ vn = xm.outputAndReparse();
+
+ String doublet = getDoublet(vn, XPATH_CONF_SCHEMA);
+ if (doublet != null)
+ throw new IllegalStateException(
+ "Unexpected error: You may not define "
+ + doublet
+ + " as this schema is already defined in the default configuration!");
+
+ doublet = getDoublet(vn, XPATH_CONF_DB);
+ if (doublet != null)
+ throw new IllegalStateException(
+ "Unexpected error: You may not define "
+ + doublet
+ + " as this connection is already defined in the default configuration!");
+
+ // Return the merged configuration data.
+ ByteArrayOutputStream os = new ByteArrayOutputStream();
+ xm.output(os);
+ return os.toByteArray();
+ }
+
+ private static String getDoublet(VTDNav vn, String xpath) {
+ String doublet = "";
+ AutoPilot ap = new AutoPilot(vn);
+ try {
+ ap.selectXPath(xpath);
+ int index = ap.evalXPath();
+ String name = null;
+ Set found = new HashSet();
+ while (index != -1) {
+ int attrIndex = vn.getAttrVal(ATTRIBUTE_NAME);
+ if (attrIndex != -1) {
+ name = vn.toString(attrIndex);
+ if (found.contains(name))
+ doublet = doublet.concat(name).concat(", ");
+ else
+ found.add(name);
+ }
+ index = ap.evalXPath();
+ }
+ } catch (XPathParseException e) {
+ e.printStackTrace();
+ } catch (XPathEvalException e) {
+ e.printStackTrace();
+ } catch (NavException e) {
+ e.printStackTrace();
+ }
+ return doublet.equals("") ? null : doublet;
+ }
+
+ /**
+ * Extracts the active configuration names (e.g. table schema) from the
+ * configuration data given by confData. Returns an array of
+ * active configuration names where the position of a specific active
+ * configuration name in the array is determined by the constants
+ * INDEX_SCHEMA, INDEX_DB etc.
+ *
+ * @param confData
+ * Configuration data to extract active configuration names from.
+ * @return A String array with the names of active configurations. The array
+ * is indexed by the INDEX_XXX constants.
+ * @throws VTDException
+ * If something concerning the parsing and value extraction goes
+ * wrong.
+ */ + private static String[] getActiveConfigurations(byte[] confData) + throws VTDException { + VTDGen vg = new VTDGen(); + vg.setDoc(confData); + vg.parse(true); + VTDNav vn = vg.getNav(); + AutoPilot ap = new AutoPilot(vn); + + String[] activeConfigurations = new String[6]; + ap.selectXPath(XPATH_ACTIVE_PG_SCHEMA); + activeConfigurations[INDEX_PG_SCHEMA] = ap.evalXPathToString(); + + ap.selectXPath(XPATH_ACTIVE_TABLE_SCHEMA); + activeConfigurations[INDEX_SCHEMA] = ap.evalXPathToString(); + + ap.selectXPath(XPATH_ACTIVE_DB); + activeConfigurations[INDEX_DB] = ap.evalXPathToString(); + + + ap.selectXPath(XPATH_MAX_CONNS); + activeConfigurations[INDEX_MAX_CONNS] = ap.evalXPathToString(); + + ap.selectXPath(XPATH_DATA_TABLE); + activeConfigurations[INDEX_DATA_TABLE] = ap.evalXPathToString(); + + ap.selectXPath(XPATH_DATA_SCHEMA); + activeConfigurations[INDEX_DATA_SCHEMA] = ap.evalXPathToString(); + + return activeConfigurations; + } + + /** + * Retrieves XML elements (determined by the used path) from the + * configuration. + * + * @param confData + * - prepared XML + * @return - the retrieved element + * @throws IOException + * @throws VTDException + */ + protected static byte[][] extractConfigData(byte[] confData) + throws IOException, VTDException { + // Allocate space for schema and DB connection data. + byte[][] configData = new byte[2][]; + VTDGen vg = new VTDGen(); + vg.setDoc(confData); + vg.parse(true); + VTDNav vn = vg.getNav(); + AutoPilot ap = new AutoPilot(vn); + + // Get schema definition data. + ap.selectXPath(XPATH_CONF_SCHEMAS); + + if (ap.evalXPath() != -1) { + String fragment = JulieXMLTools.getFragment(vn, JulieXMLTools.CONTENT_FRAGMENT, + true); + configData[INDEX_SCHEMA] = fragment.getBytes(); + } + + // Get database connection data. 
+ ap.selectXPath(XPATH_CONF_DBS); + + if (ap.evalXPath() != -1) { + String fragment = JulieXMLTools.getFragment(vn, JulieXMLTools.CONTENT_FRAGMENT, + true); + configData[INDEX_DB] = fragment.getBytes(); + } + + return configData; + } + + /** + * Accessing the Database Configuration + * + * @return - DatabaseConfig Object + */ + public DBConfig getDatabaseConfig() { + return dbConf; + } + + /** + *

+ * Accessing the Field Definitions. + *

+ *

+ * The returned map consists of pairs in the form + * (schemaName, fieldConfig) where schemaName is + * the name of the table schema represented by fieldConfig. + *

+ * + * @return - A map containing all table schemas in the default and in the + * user configuration. + */ + public FieldConfigurationManager getFieldConfigs() { + return fieldConfigs; + } + + /** + * @return the activeDataTable + */ + public String getActiveDataTable() { + return activeDataTable; + } + + public String getActiveDataSchema() { + return activeDataSchema; + } + + /** + * @return the activeSchemaName + */ + public String getActiveSchemaName() { + return activeSchemaName; + } + + /** + * @return the mergedConfigData + */ + public byte[] getMergedConfigData() { + return mergedConfigData; + } + + public List getTableSchemaNames() { + return schemaNames; + } + +} diff --git a/src/main/java/de/julielab/xmlData/config/DBConfig.java b/src/main/java/de/julielab/costosys/configuration/DBConfig.java similarity index 99% rename from src/main/java/de/julielab/xmlData/config/DBConfig.java rename to src/main/java/de/julielab/costosys/configuration/DBConfig.java index c8e6fc0..5168137 100644 --- a/src/main/java/de/julielab/xmlData/config/DBConfig.java +++ b/src/main/java/de/julielab/costosys/configuration/DBConfig.java @@ -1,4 +1,4 @@ -package de.julielab.xmlData.config; +package de.julielab.costosys.configuration; import java.io.IOException; diff --git a/src/main/java/de/julielab/xmlData/config/FieldConfig.java b/src/main/java/de/julielab/costosys/configuration/FieldConfig.java similarity index 99% rename from src/main/java/de/julielab/xmlData/config/FieldConfig.java rename to src/main/java/de/julielab/costosys/configuration/FieldConfig.java index a6c8532..d0b4e0a 100644 --- a/src/main/java/de/julielab/xmlData/config/FieldConfig.java +++ b/src/main/java/de/julielab/costosys/configuration/FieldConfig.java @@ -13,7 +13,7 @@ * Creation date: 11.03.2011 **/ -package de.julielab.xmlData.config; +package de.julielab.costosys.configuration; import java.util.ArrayList; import java.util.HashMap; @@ -22,6 +22,7 @@ import java.util.Map; import java.util.stream.Stream; +import 
de.julielab.costosys.Constants; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -41,7 +42,6 @@ import de.julielab.xml.JulieXMLConstants; import de.julielab.xml.JulieXMLTools; -import de.julielab.xmlData.Constants; /** * This class holds the definition of fields for the database table to work diff --git a/src/main/java/de/julielab/costosys/configuration/FieldConfigurationManager.java b/src/main/java/de/julielab/costosys/configuration/FieldConfigurationManager.java new file mode 100644 index 0000000..5719d71 --- /dev/null +++ b/src/main/java/de/julielab/costosys/configuration/FieldConfigurationManager.java @@ -0,0 +1,64 @@ +/** + * FieldConfigurationManager.java + * + * Copyright (c) 2013, JULIE Lab. + * All rights reserved. This program and the accompanying materials + * are made available under the terms of the Common Public License v1.0 + * + * Author: faessler + * + * Current version: 1.0 + * Since version: 1.0 + * + * Creation date: 01.02.2013 + **/ + +/** + * + */ +package de.julielab.costosys.configuration; + +import java.util.HashMap; + +import org.apache.commons.lang3.StringUtils; + +/** + *

+ * This class is essentially a HashMap. + *

+ *

+ * It maps table schema names defined in the default or user provided + * configuration to the {@link FieldConfig} objects modeling these schemas. This + * class adds some minor validity checks to the default map methods. + *

+ * + * @author faessler + * + */ +public class FieldConfigurationManager extends HashMap { + + /** + * + */ + private static final long serialVersionUID = -6516109594561720970L; + + /* + * (non-Javadoc) + * + * @see java.util.HashMap#get(java.lang.Object) + */ + @Override + public FieldConfig get(Object key) { + if (null == key || StringUtils.isBlank(key.toString())) + throw new TableSchemaDoesNotExistException( + "The name of the table schema to fetch was null."); + + FieldConfig fieldConfig = super.get(key); + if (null == fieldConfig) { + throw new TableSchemaDoesNotExistException("The requested table schema definition \"" + key + + "\" is not defined in the default configuration or the user provided configuration."); + } + return fieldConfig; + } + +} diff --git a/src/main/java/de/julielab/hiddenConfig/HiddenConfig.java b/src/main/java/de/julielab/costosys/configuration/HiddenConfig.java similarity index 95% rename from src/main/java/de/julielab/hiddenConfig/HiddenConfig.java rename to src/main/java/de/julielab/costosys/configuration/HiddenConfig.java index 74a3814..cc1f8ca 100644 --- a/src/main/java/de/julielab/hiddenConfig/HiddenConfig.java +++ b/src/main/java/de/julielab/costosys/configuration/HiddenConfig.java @@ -1,4 +1,4 @@ -package de.julielab.hiddenConfig; +package de.julielab.costosys.configuration; import java.io.BufferedReader; import java.io.File; @@ -13,7 +13,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import de.julielab.xmlData.Constants; +import de.julielab.costosys.Constants; /** * This class reads a hidden configuration file in the users home directory. If no such file exists, a new one can be @@ -35,7 +35,7 @@ public class HiddenConfig { private File configFile; /** - * Reads a hidden config file in the users home directory. + * Reads a hidden configuration file in the users home directory. 
*/ public HiddenConfig() { String home = System.getProperty("user.home"); @@ -101,7 +101,7 @@ else if (homeDir.exists()) } /** - * @return - The username in the hidden config file + * @return - The username in the hidden configuration file * */ public String getUsername(String DBConnectionName) { @@ -109,7 +109,7 @@ public String getUsername(String DBConnectionName) { } /** - * @return - The password in the hidden config file + * @return - The password in the hidden configuration file * */ public String getPassword(String DBConnectionName) { diff --git a/src/main/java/de/julielab/xmlData/config/TableSchemaDoesNotExistException.java b/src/main/java/de/julielab/costosys/configuration/TableSchemaDoesNotExistException.java similarity index 93% rename from src/main/java/de/julielab/xmlData/config/TableSchemaDoesNotExistException.java rename to src/main/java/de/julielab/costosys/configuration/TableSchemaDoesNotExistException.java index 3c7390c..f806f4b 100644 --- a/src/main/java/de/julielab/xmlData/config/TableSchemaDoesNotExistException.java +++ b/src/main/java/de/julielab/costosys/configuration/TableSchemaDoesNotExistException.java @@ -16,7 +16,7 @@ /** * */ -package de.julielab.xmlData.config; +package de.julielab.costosys.configuration; /** * @author faessler diff --git a/src/main/java/de/julielab/xmlData/dataBase/CoStoSysConnection.java b/src/main/java/de/julielab/costosys/dbconnection/CoStoSysConnection.java similarity index 95% rename from src/main/java/de/julielab/xmlData/dataBase/CoStoSysConnection.java rename to src/main/java/de/julielab/costosys/dbconnection/CoStoSysConnection.java index 5a1e57e..be3ff2c 100644 --- a/src/main/java/de/julielab/xmlData/dataBase/CoStoSysConnection.java +++ b/src/main/java/de/julielab/costosys/dbconnection/CoStoSysConnection.java @@ -1,6 +1,6 @@ -package de.julielab.xmlData.dataBase; +package de.julielab.costosys.dbconnection; -import de.julielab.xmlData.dataBase.util.CoStoSysSQLRuntimeException; +import 
de.julielab.costosys.dbconnection.util.CoStoSysSQLRuntimeException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/src/main/java/de/julielab/xmlData/dataBase/ConnectionClosable.java b/src/main/java/de/julielab/costosys/dbconnection/ConnectionClosable.java similarity index 61% rename from src/main/java/de/julielab/xmlData/dataBase/ConnectionClosable.java rename to src/main/java/de/julielab/costosys/dbconnection/ConnectionClosable.java index 0a05ae6..0df0c88 100644 --- a/src/main/java/de/julielab/xmlData/dataBase/ConnectionClosable.java +++ b/src/main/java/de/julielab/costosys/dbconnection/ConnectionClosable.java @@ -1,4 +1,4 @@ -package de.julielab.xmlData.dataBase; +package de.julielab.costosys.dbconnection; public interface ConnectionClosable { void closeConnection(); diff --git a/src/main/java/de/julielab/xmlData/dataBase/DBCIterator.java b/src/main/java/de/julielab/costosys/dbconnection/DBCIterator.java similarity index 95% rename from src/main/java/de/julielab/xmlData/dataBase/DBCIterator.java rename to src/main/java/de/julielab/costosys/dbconnection/DBCIterator.java index 20f3c62..9125738 100644 --- a/src/main/java/de/julielab/xmlData/dataBase/DBCIterator.java +++ b/src/main/java/de/julielab/costosys/dbconnection/DBCIterator.java @@ -16,7 +16,7 @@ /** * */ -package de.julielab.xmlData.dataBase; +package de.julielab.costosys.dbconnection; import java.util.Iterator; diff --git a/src/main/java/de/julielab/xmlData/dataBase/DBCThreadedIterator.java b/src/main/java/de/julielab/costosys/dbconnection/DBCThreadedIterator.java similarity index 92% rename from src/main/java/de/julielab/xmlData/dataBase/DBCThreadedIterator.java rename to src/main/java/de/julielab/costosys/dbconnection/DBCThreadedIterator.java index b2969f9..74731d6 100644 --- a/src/main/java/de/julielab/xmlData/dataBase/DBCThreadedIterator.java +++ b/src/main/java/de/julielab/costosys/dbconnection/DBCThreadedIterator.java @@ -1,6 +1,6 @@ -package de.julielab.xmlData.dataBase; 
+package de.julielab.costosys.dbconnection; -import de.julielab.xmlData.dataBase.util.CoStoSysSQLRuntimeException; +import de.julielab.costosys.dbconnection.util.CoStoSysSQLRuntimeException; import java.util.Iterator; import java.util.List; diff --git a/src/main/java/de/julielab/costosys/dbconnection/DataBaseConnector.java b/src/main/java/de/julielab/costosys/dbconnection/DataBaseConnector.java new file mode 100644 index 0000000..e9f0a40 --- /dev/null +++ b/src/main/java/de/julielab/costosys/dbconnection/DataBaseConnector.java @@ -0,0 +1,3995 @@ +package de.julielab.costosys.dbconnection; + +import com.google.common.cache.CacheBuilder; +import com.google.common.cache.CacheLoader; +import com.google.common.cache.LoadingCache; +import com.zaxxer.hikari.HikariConfig; +import com.zaxxer.hikari.HikariDataSource; +import com.zaxxer.hikari.HikariPoolMXBean; +import de.julielab.costosys.Constants; +import de.julielab.costosys.cli.TableNotFoundException; +import de.julielab.costosys.configuration.*; +import de.julielab.costosys.dbconnection.util.*; +import de.julielab.xml.JulieXMLConstants; +import de.julielab.xml.JulieXMLTools; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.management.JMX; +import javax.management.MBeanServer; +import javax.management.ObjectName; +import java.io.*; +import java.lang.management.ManagementFactory; +import java.sql.*; +import java.util.*; +import java.util.Map.Entry; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.ExecutionException; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +/** + * This class creates a connection with a database and allows for convenient + * queries and commands.
+ * Database layout and returned columns are specified by a configuration file. + * The class was developed for a PostgreSQL back-end, using another database + * server may require modifications.
+ * Queries use up to 3 threads for higher performance and a connection pool is + * used for higher performance if multiple instances are deployed simultaneous. + *

+ * Visit + * http://commons.apache.org/dbcp/apidocs/org/apache/commons/dbcp/package- + * summary.html#package_description<\code> for more information about the + * connection pooling. + * + * @author hellrich, faessler + */ +public class DataBaseConnector { + + public static final String DEFAULT_PIPELINE_STATE = ""; + /** + * Used as a hack for the not-yet-published EMNLP-Paper. In the meantime, a more + * sophisticated system has been implemented (EF, 18.01.2012) + */ + @Deprecated + public static final int META_IN_ARRAY = 2; + /** + * This is the definition of subset tables except the primary key. + */ + public static final LinkedHashMap subsetColumns; + /** + * Size of the batches used for data retrieval from the database, value is + * optimized for xml-clobs in postgres on 2010 hardware. + */ + private static final int DEFAULT_QUERY_BATCH_SIZE = 1000; + /** + * Size of the byte buffer used for reading xml into vtd (xml parser) + */ + private final static int BUFFER_SIZE = 1000; + private static final String DEFAULT_FIELD = "xml"; + private static final String DEFAULT_TABLE = Constants.DEFAULT_DATA_TABLE_NAME; + private static final int commitBatchSize = 100; + private static final int RETRIEVE_MARK_LIMIT = 1000; + private static final int ID_SUBLIST_SIZE = 1000; + private static final Map pools = new ConcurrentHashMap<>(); + /** + * A set of field definitions read from a configuration XML file. Contains the + * name of each field as well as a source for the field's value. + */ + // private FieldConfig fieldConfig; + // For import + private static Logger LOG = LoggerFactory.getLogger(DataBaseConnector.class); + private static Thread commitThread = null; + private static LoadingCache> connectionCache = CacheBuilder + .newBuilder() + // The weak keys are the main reason to use the cache. It allows to garbage collect the threads + // that have reserved connections and did never release them. 
Those threads would be held in memory + // when we used strong references which would be a memory leak. + .weakKeys() + .build(new CacheLoader>() { + @Override + public List load(Thread thread) { + return new ArrayList<>(); + } + }); + private static HikariDataSource dataSource; + + static { + subsetColumns = new LinkedHashMap<>(); + subsetColumns.put(Constants.LOG, "text"); + subsetColumns.put(Constants.IS_PROCESSED, "boolean DEFAULT false"); + subsetColumns.put(Constants.IN_PROCESS, "boolean DEFAULT false"); + subsetColumns.put(Constants.LAST_COMPONENT, "text DEFAULT '" + DEFAULT_PIPELINE_STATE + "'"); + subsetColumns.put(Constants.HAS_ERRORS, "boolean DEFAULT false"); + subsetColumns.put(Constants.PID, "character varying(10)"); + subsetColumns.put(Constants.HOST_NAME, "character varying(100)"); + subsetColumns.put(Constants.PROCESSING_TIMESTAMP, "timestamp without time zone"); + } + + /** + * Sometimes it is necessary to manage multiple data tables with different field + * schemas. fieldConfigs contains all field schema names in the configuration, + * mapped to the corresponding FieldConfig instance. + */ + private FieldConfigurationManager fieldConfigs; + private DBConfig dbConfig; + private String activeDataSchema; + private String activeDataTable; + private String activeTableSchema; + private byte[] effectiveConfiguration; + private int queryBatchSize = DEFAULT_QUERY_BATCH_SIZE; + private String dbURL; + private String user; + private String password; + private ConfigReader config; + + /************************************************************************** + *************************** Constructors ******************************** + **************************************************************************/ + + + public DataBaseConnector(String configPath) throws FileNotFoundException { + this(findConfigurationFile(configPath)); + } + + /** + * This class creates a connection with a database and allows for convenient + * queries and commands. 
+ * + * @param configStream used to read the configuration for this connector instance + */ + public DataBaseConnector(InputStream configStream) { + config = new ConfigReader(configStream); + dbConfig = config.getDatabaseConfig(); + this.dbURL = dbConfig.getUrl(); + this.fieldConfigs = config.getFieldConfigs(); + this.activeDataSchema = config.getActiveDataSchema(); + this.activeDataTable = config.getActiveDataTable().contains(".") ? config.getActiveDataTable() : this.activeDataSchema + "." + config.getActiveDataTable(); + this.activeTableSchema = config.getActiveSchemaName(); + this.effectiveConfiguration = config.getMergedConfigData(); + + if (!StringUtils.isBlank(dbConfig.getActiveDatabase()) && (StringUtils.isBlank(user) || StringUtils.isBlank(password))) { + HiddenConfig hc = new HiddenConfig(); + this.user = hc.getUsername(dbConfig.getActiveDatabase()); + this.password = hc.getPassword(dbConfig.getActiveDatabase()); + LOG.info("Connecting to " + this.dbURL + " as " + this.user); + } else { + LOG.warn( + "No active database configured in configuration file or configuration file is empty or does not exist."); + } + LOG.info("Active Postgres schema: {}", dbConfig.getActivePGSchema() ); + LOG.info("Active data Postgres schema: {}", dbConfig.getActiveDataPGSchema() ); + } + + /** + * This class creates a connection with a database and allows for convenient + * queries and commands. + * + * @param configStream used to read the configuration for this connector instance + * @param queryBatchSize background threads are utilized to speed up queries, this + * parameter determines the number of pre-fetched entries + */ + public DataBaseConnector(InputStream configStream, int queryBatchSize) { + this(configStream); + this.queryBatchSize = queryBatchSize; + } + + /** + * This class creates a connection with a database and allows for convenient + * queries and commands. 
+ * + * @param dbUrl the url of the database + * @param user the username for the db + * @param password the password for the username + * @param fieldDefinition InputStream containing data of a configuration file + */ + public DataBaseConnector(String dbUrl, String user, String password, String pgSchema, InputStream fieldDefinition) { + this(dbUrl, user, password, pgSchema, DEFAULT_QUERY_BATCH_SIZE, fieldDefinition); + } + + public DataBaseConnector(String serverName, String dbName, String user, String password, String pgSchema, + InputStream fieldDefinition) { + this(serverName, dbName, user, password, pgSchema, DEFAULT_QUERY_BATCH_SIZE, fieldDefinition); + } + + /** + * This class creates a connection with a database and allows for convenient + * queries and commands. + * + * @param dbUrl the url of the database + * @param user the username for the db + * @param password the password for the username + * @param queryBatchSize background threads are utilized to speed up queries, this + * parameter determines the number of pre-fetched entries + * @param configStream used to read the configuration for this connector instance + */ + public DataBaseConnector(String dbUrl, String user, String password, String pgSchema, int queryBatchSize, + InputStream configStream) { + this(configStream, queryBatchSize); + // Manually entered values have priority. + setCredentials(dbUrl, user, password, pgSchema); + } + + public DataBaseConnector(String serverName, String dbName, String user, String password, String pgSchema, + int queryBatchSize, InputStream configStream) { + this(configStream, queryBatchSize); + // Manually entered values have priority. 
+ String dbUrl = null; + if (dbName != null && serverName != null) + dbUrl = "jdbc:postgresql://" + serverName + ":5432/" + dbName; + else { + if (dbName != null) + dbUrl = dbConfig.getUrl().replaceFirst("/[^/]+$", "/" + dbName); + if (serverName != null) + dbUrl = dbConfig.getUrl().replaceFirst("(.*//)[^/:]+(.*)", "$1" + serverName + "$2"); + } + + setCredentials(dbUrl, user, password, pgSchema); + } + + /** + * This class creates a connection with a database and allows for convenient + * queries and commands. + * + * @param dbUrl the url of the database + * @param user the username for the db + * @param password the password for the username + */ + public DataBaseConnector(String dbUrl, String user, String password) { + this(dbUrl, user, password, null, DEFAULT_QUERY_BATCH_SIZE, null); + } + + private static InputStream findConfigurationFile(String configPath) throws FileNotFoundException { + LOG.debug("Loading DatabaseConnector configuration file from path \"{}\"", configPath); + File dbcConfigFile = new File(configPath); + InputStream is; + if (dbcConfigFile.exists()) { + LOG.debug("Found database configuration at file {}", dbcConfigFile); + is = new FileInputStream(configPath); + } else { + String cpResource = configPath.startsWith("/") ? configPath : "/" + configPath; + LOG.debug("The database configuration file could not be found as a file at {}. 
Trying to lookup configuration as a classpath resource at {}", dbcConfigFile, cpResource); + is = DataBaseConnector.class.getResourceAsStream(cpResource); + if (is != null) + LOG.debug("Found database configuration file as classpath resource at {}", cpResource); + } + if (is == null) { + throw new IllegalArgumentException("DatabaseConnector configuration " + configPath + " could not be found as file or a classpath resource."); + } + return is; + } + + public ConfigReader getConfig() { + return config; + } + + /** + * @param dbUrl + * @param user + * @param password + * @param pgSchema + */ + private void setCredentials(String dbUrl, String user, String password, String pgSchema) { + if (dbUrl != null) + this.dbURL = dbUrl; + if (user != null) + this.user = user; + if (password != null) + this.password = password; + if (pgSchema != null) + setActivePGSchema(pgSchema); + if ((dbUrl != null) || (user != null) || (password != null) || (pgSchema != null)) + LOG.info("Connecting to " + this.dbURL + " as " + this.user + " in Postgres Schema " + pgSchema); + } + + public void setHost(String host) { + if (host != null) { + dbURL = dbURL.replaceFirst("(.*//)[^/:]+(.*)", "$1" + host + "$2"); + LOG.debug("Setting database host to {}. DB URL is now {}", host, dbURL); + } + } + + public void setPort(String port) { + setPort(Integer.parseInt(port)); + } + + public void setPort(Integer port) { + if (port != null) { + this.dbURL = dbURL.replaceFirst(":[0-9]+", ":" + port); + LOG.debug("Setting database port to {}. DB URL is now {}", port, dbURL); + } + } + + public void setUser(String user) { + this.user = user; + LOG.debug("Setting database user for {} to {}", this.dbURL, user); + } + + public void setPassword(String password) { + this.password = password; + LOG.debug("Changing database password."); + } + + public void setMaxConnections(int num) { + dbConfig.setMaxConnections(num); + } + + /** + * @return A Connection to the database. 
+ */ + Connection getConn() { + + Connection conn = null; + synchronized (DataBaseConnector.class) { + if (null == dataSource || ((HikariDataSource) dataSource).isClosed()) { + LOG.debug("Setting up connection pool data source"); + HikariConfig hikariConfig = new HikariConfig(); + hikariConfig.setPoolName("costosys-" + System.nanoTime()); + hikariConfig.setJdbcUrl(dbURL); + hikariConfig.setUsername(user); + hikariConfig.setPassword(password); + hikariConfig.setConnectionTestQuery("SELECT TRUE"); + hikariConfig.setMaximumPoolSize(dbConfig.getMaxConnections()); + hikariConfig.setConnectionTimeout(60000); + // required to be able to get the number of idle connections, see below + hikariConfig.setRegisterMbeans(true); + HikariDataSource ds = pools.compute(dbURL, (url, source) -> source == null ? new HikariDataSource(hikariConfig) : source); + if (ds.isClosed()) { + ds = new HikariDataSource(hikariConfig); + } + pools.put(dbURL, ds); + dataSource = ds; + } + } + + try { + int retries = 0; + do { + try { + LOG.trace("Waiting for SQL connection to become free..."); + if (LOG.isTraceEnabled()) { + String poolName = dataSource.getPoolName(); + HikariPoolMXBean poolProxy = dataSource.getHikariPoolMXBean(); + int totalConnections = poolProxy.getTotalConnections(); + int idleConnections = poolProxy.getIdleConnections(); + int activeConnections = poolProxy.getActiveConnections(); + int threadsAwaitingConnection = poolProxy.getThreadsAwaitingConnection(); + LOG.trace("Pool {} has {} total connections", poolName, totalConnections); + LOG.trace("Pool {} has {} idle connections left", poolName, idleConnections); + LOG.trace("Pool {} has {} active connections", poolName, activeConnections); + LOG.trace("Pool {} has {} threads awaiting a connection", poolName, threadsAwaitingConnection); + + } + conn = dataSource.getConnection(); + // conn = DriverManager.getConnection(fullURI); + LOG.trace("SQL connection obtained."); + Statement stm = conn.createStatement(); + if 
(!schemaExists(dbConfig.getActivePGSchema(), conn)) + createSchema(dbConfig.getActivePGSchema(), conn); + if (!schemaExists(dbConfig.getActiveDataPGSchema(), conn)) + createSchema(dbConfig.getActiveDataPGSchema(), conn); + stm.execute(String.format("SET search_path TO %s", dbConfig.getActivePGSchema())); + stm.close(); + } catch (SQLException e) { + LOG.warn("SQLException occurred:", e); + LOG.warn("Could not obtain a database connection within the timeout for thread {}. Trying again. Number of try: {}", Thread.currentThread().getName(), ++retries); + MBeanServer mBeanServer = ManagementFactory.getPlatformMBeanServer(); + try { + String poolNameStr = ((HikariDataSource) dataSource).getPoolName(); + ObjectName poolName = new ObjectName("com.zaxxer.hikari:type=Pool (" + poolNameStr + ")"); + HikariPoolMXBean poolProxy = JMX.newMXBeanProxy(mBeanServer, poolName, HikariPoolMXBean.class); + int totalConnections = poolProxy.getTotalConnections(); + int idleConnections = poolProxy.getIdleConnections(); + int activeConnections = poolProxy.getActiveConnections(); + int threadsAwaitingConnection = poolProxy.getThreadsAwaitingConnection(); + LOG.warn("Pool {} has {} total connections", poolName, totalConnections); + LOG.warn("Pool {} has {} idle connections left", poolName, idleConnections); + LOG.warn("Pool {} has {} active connections", poolName, activeConnections); + LOG.warn("Pool {} has {} threads awaiting a connection", poolName, threadsAwaitingConnection); + + } catch (Throwable t) { + LOG.warn("Could not retrieve connection pool statistics: {}. 
More information can be found on DEBUG level.", t.getMessage()); + LOG.debug("Could not retrieve connection pool statistics:", t); + } + if (retries == 3) + throw e; + } + } while (conn == null); + if (retries > 0) + LOG.warn("It took {} retries to obtain a connection", retries); + } catch (SQLException e) { + LOG.error("Could not connect with " + dbURL); + throw new UnobtainableConnectionException("No database connection could be obtained from the connection " + + "pool. This can have one of two causes: Firstly, the application might just use all connections " + + "concurrently. Then, a higher number of maximum active database connections in the CoStoSys " + + "configuration might help. This " + + "number is currently set to " + config.getDatabaseConfig().getMaxConnections() + ". The other " + + "possibility are programming errors where connections are retrieved but not closed. Closing " + + "connections means to return them to the pool. It must always be made sure that connections are " + + "closed when they are no longer required. If database iterators are used. i.e. subclasses of " + + "DBCIterator, make sure to fully read the iterators. Otherwise, they might keep a permanent " + + "connection to the database while waiting to be consumed.", e); + } + return conn; + } + + + /** + * @return the activeDataTable + */ + public String getActiveDataTable() { + return activeDataTable; + } + + /** + *

+ * Returns the effective XML configuration as a byte[]. + *

+ *

+ * The effective configuration consists of the default configuration and the + * given user configuration as well (merged by the ConfigReader in the + * constructor). + *

+ * + * @return the effectiveConfiguration + */ + public byte[] getEffectiveConfiguration() { + return effectiveConfiguration; + } + + public String getActiveDataPGSchema() { + return activeDataSchema; + } + + public String getActivePGSchema() { + return dbConfig.getActivePGSchema(); + } + + public void setActivePGSchema(String pgSchema) { + dbConfig.setActivePGSchema(pgSchema); + } + + public String getActiveTableSchema() { + return activeTableSchema; + } + + public void setActiveTableSchema(String schemaName) { + this.activeTableSchema = schemaName; + } + + public FieldConfig getActiveTableFieldConfiguration() { + return fieldConfigs.get(activeTableSchema); + } + + /** + *

+ * Retrieves from a subset-table limit primary keys whose rows are + * not marked to be in process or finished being processed and sets the rows of + * the retrieved primary keys as being "in process". + *

+ *

+ * The table is locked during this transaction. Locking and marking ensure that + * every primary key will be returned exactly once. Remember to remove the marks + * if you want to use the subset again ;) + *

+ * + * @param subsetTableName - name of a table, conforming to the subset standard + * @param hostName - will be saved in the subset table + * @param pid - will be saved in the subset table + * @return An ArrayList of pmids which have not yet been processed + */ + public List retrieveAndMark(String subsetTableName, String readerComponent, String hostName, String pid) throws TableSchemaMismatchException, TableNotFoundException { + return retrieveAndMark(subsetTableName, readerComponent, hostName, pid, RETRIEVE_MARK_LIMIT, null); + } + + /** + *

+ * Retrieves primary keys from a subset table and marks them as being "in + * process". The table schema - and thus the form of the primary keys - is + * assumed to match the active table schema determined in the configuration + * file. + *

+ * The table is locked during this transaction. Locking and marking ensure that + * every primary key will be returned exactly once. Remember to remove the marks + * if you want to use the subset again ;) + * + * @param subsetTableName - name of a table, conforming to the subset standard + * @param hostName - will be saved in the subset table + * @param pid - will be saved in the subset table + * @param limit - batchsize for marking/retrieving + * @param order - determines an ordering. Default order (which may change over + * time) when this parameter is null or empty. + * @return An ArrayList of primary keys which have not yet been processed. + * @see #retrieveAndMark(String, String, String, String, int, String) + */ + public List retrieveAndMark(String subsetTableName, String readerComponent, String hostName, String pid, + int limit, String order) throws TableSchemaMismatchException, TableNotFoundException { + return retrieveAndMark(subsetTableName, activeTableSchema, readerComponent, hostName, pid, limit, order); + } + + /** + *

+ * Retrieves from a subset-table limit primary keys whose rows are + * not marked to be in process or finished being processed and sets the rows of + * the retrieved primary keys as being "in process". + *

+ *

+ * The following parameters may be set: + *

    + *
  • limit - sets the maximum number of primary keys retrieved + *
  • order - determines whether to retrieve the primary keys in a + * particular order. Note that the default order of rows is undefined. If you + * need the same order in every run, you should specify some ordering as an SQL + * 'ORDER BY' statement. When order is not prefixed with 'ORDER BY' + * (case ignored), it will be inserted. + *
+ *

+ *

+ * The table is locked during this transaction. Locking and marking ensure that + * every primary key will be returned exactly once. Remember to remove the marks + * if you want to use the subset again ;) + *

+ * + * @param subsetTableName - name of a table, conforming to the subset standard + * @param hostName - will be saved in the subset table + * @param pid - will be saved in the subset table + * @param limit - batchsize for marking/retrieving + * @param order - determines an ordering. Default order (which may change over + * time) when this parameter is null or empty. + * @return An ArrayList of primary keys which have not yet been processed. + */ + public List retrieveAndMark(String subsetTableName, String schemaName, String readerComponent, + String hostName, String pid, int limit, String order) throws TableSchemaMismatchException, TableNotFoundException { + checkTableDefinition(subsetTableName, schemaName); + List ids = new ArrayList<>(limit); + String sql = null; + Connection conn = null; + boolean idsRetrieved = false; + while (!idsRetrieved) { + try (CoStoSysConnection costoConn = obtainOrReserveConnection()){ + FieldConfig fieldConfig = fieldConfigs.get(schemaName); + conn = costoConn.getConnection(); + + conn.setAutoCommit(false); + Statement st = conn.createStatement(); + String orderCommand = order == null ? "" : order; + if (!orderCommand.equals("") && !orderCommand.trim().toUpperCase().startsWith("ORDER BY")) + orderCommand = "ORDER BY " + orderCommand; + String joinStatement = Stream.of(fieldConfig.getPrimaryKey()).map(pk -> { + return "t." + pk + "=subquery." + pk; + }).collect(Collectors.joining(" AND ")); + String returnColumns = Stream.of(fieldConfig.getPrimaryKey()).map(pk -> { + return "t." 
+ pk; + }).collect(Collectors.joining(",")); + + // following + // http://dba.stackexchange.com/questions/69471/postgres-update-limit-1 + sql = "UPDATE " + subsetTableName + " AS t SET " + Constants.IN_PROCESS + " = TRUE, " + + Constants.LAST_COMPONENT + " = '" + readerComponent + "', " + Constants.HOST_NAME + " = \'" + + hostName + "\', " + Constants.PID + " = \'" + pid + "\'," + Constants.PROCESSING_TIMESTAMP + + " = 'now' FROM (SELECT " + fieldConfig.getPrimaryKeyString() + " FROM " + subsetTableName + + " WHERE " + Constants.IN_PROCESS + " = FALSE AND " + // eigentlich wollen wir anstelle von FOR UPDATE sogar: + // FOR UPDATE SKIP LOCKED in PostgreSQL 9.5 <---!! + + Constants.IS_PROCESSED + " = FALSE " + orderCommand + " LIMIT " + limit + + " FOR UPDATE SKIP LOCKED) AS subquery WHERE " + joinStatement + " RETURNING " + returnColumns; + try (ResultSet res = st.executeQuery(sql)) { + String[] pks = fieldConfig.getPrimaryKey(); + while (res.next()) { + Object[] values = new String[pks.length]; + for (int i = 0; i < pks.length; i++) { + values[i] = res.getObject(i + 1); + } + ids.add(values); + } + idsRetrieved = true; + } + conn.commit(); + } catch (SQLException e) { + // It is possible to run into deadlocks with the above query. Then, one process + // will be canceled and we get an exception. If so, just log is and try again. + if (!e.getMessage().contains("deadlock detected") && (e.getNextException() == null + || !e.getNextException().getMessage().contains("deadlock detected"))) { + LOG.error( + "Error while retrieving document IDs and marking them to be in process. Sent SQL command: {}.", + sql, e); + SQLException nextException = e.getNextException(); + if (null != nextException) + LOG.error("Next exception: {}", nextException); + // this is not the deadlock error; break the loop + break; + } else { + LOG.debug( + "Database deadlock has been detected while trying to retrieve document IDs and marking them to be processed. 
Tying again."); + // We need to close the current, failed, transaction and start a new one for the + // new try. + try { + conn.commit(); + } catch (SQLException e1) { + e1.printStackTrace(); + } + } + } + } + if (LOG.isTraceEnabled()) { + LOG.trace("The following IDs were retrieved from table {}: {}", subsetTableName, ids.stream().map(Arrays::toString).collect(Collectors.joining("; "))); + } + return ids; + } + + /** + * @param subsetTableName + * @return + * @see #countUnprocessed(String) + */ + public int countUnprocessed(String subsetTableName) { + return countUnprocessed(subsetTableName, activeTableSchema); + } + + /** + * Counts the unprocessed rows in a subset table + * + * @param subsetTableName - name of the subset table + * @return - number of rows + */ + public int countUnprocessed(String subsetTableName, String schemaName) { + FieldConfig fieldConfig = fieldConfigs.get(schemaName); + + int rows = 0; + try (CoStoSysConnection conn = obtainOrReserveConnection()) { + ResultSet res = conn.getConnection().createStatement().executeQuery( + // as we are just looking for any unprocessed documents it + // is + // sufficient - even in the case of multiple primary key + // elements - to use the name of the first element + // in this command + "SELECT count(" + fieldConfig.getPrimaryKey()[0] + ")" + " FROM " + subsetTableName + " WHERE " + + Constants.PROCESSED + " = FALSE;"); + if (res.next()) + rows = res.getInt(1); + } catch (SQLException e) { + e.printStackTrace(); + } + return rows; + } + + public int countRowsOfDataTable(String tableName, String whereCondition) { + return countRowsOfDataTable(tableName, whereCondition, activeTableSchema); + } + + public int countRowsOfDataTable(String tableName, String whereCondition, String schemaName) { + FieldConfig fieldConfig = fieldConfigs.get(schemaName); + + int rows = 0; + try (CoStoSysConnection conn = obtainOrReserveConnection()) { + if (whereCondition != null) { + whereCondition = whereCondition.trim(); + if 
(!whereCondition.toUpperCase().startsWith("WHERE")) + whereCondition = " WHERE " + whereCondition; + else + whereCondition = " " + whereCondition; + } else + whereCondition = ""; + + ResultSet res = conn.createStatement().executeQuery( + "SELECT count(" + fieldConfig.getPrimaryKeyString() + ")" + " FROM " + tableName + whereCondition); + if (res.next()) + rows = res.getInt(1); + } catch (SQLException e) { + e.printStackTrace(); + } + return rows; + } + + public boolean hasUnfetchedRows(String tableName) { + return hasUnfetchedRows(tableName, activeTableSchema); + } + + /************************************************************************** + ******************************** Utility ********************************** + ***************************************************************************/ + + public boolean hasUnfetchedRows(String tableName, String schemaName) { + FieldConfig fieldConfig = fieldConfigs.get(schemaName); + + try (CoStoSysConnection conn = obtainOrReserveConnection()) { + ResultSet res = conn.createStatement() + .executeQuery("SELECT " + fieldConfig.getPrimaryKeyString() + " FROM " + tableName + " WHERE " + + Constants.IN_PROCESS + " = FALSE AND " + Constants.IS_PROCESSED + " = FALSE LIMIT 1"); + return res.next(); + } catch (SQLException e) { + e.printStackTrace(); + } + return false; + } + + /** + * Deletes entries from a table + * + * @param table name of the table + * @param ids primary key arrays defining the entries to delete + * @see #deleteFromTableSimplePK(String, List) + */ + public void deleteFromTable(String table, List ids) { + String sql = "DELETE FROM " + table + " WHERE "; + modifyTable(sql, ids); + } + + /** + * Deletes entries from a table where the primary key of this table must consist + * of exactly one column. For deletion from tables which contain a + * multi-column-primary-key see {@link #deleteFromTable(String, List)}. 
+ * + * @param table name of the table + * @param ids primary key arrays defining the entries to delete + * @see #deleteFromTable(String, List) + */ + public void deleteFromTableSimplePK(String table, List ids) { + String sql = "DELETE FROM " + table + " WHERE "; + + // Convert the given list to a list of object arrays, so it fits to + // 'modifyTable'. + List objectIds = new ArrayList(ids.size()); + for (T id : ids) + objectIds.add(new Object[]{id}); + modifyTable(sql, objectIds); + } + + /** + * Modifies a subset table, marking entries as processed. + * + * @param table name of the subset table + * @param ids primary key arrays defining the entries to delete + */ + public void markAsProcessed(String table, List ids) { + String sql = "UPDATE " + table + " SET " + Constants.PROCESSED + " = TRUE WHERE "; + modifyTable(sql, ids); + } + + /** + *

+ * Executes a given SQL command (must end with "WHERE "!) and extends the + * WHERE-clause with the primary keys, set to the values in ids. + *

+ *

+ * Assumes that the form of the primary keys matches the definition given in the + * active table schema in the configuration. + *

+ * + * @param sql a valid SQL command, ending with "WHERE " + * @param ids list of primary key arrays + * @see #modifyTable(String, List) + */ + public void modifyTable(String sql, List ids) { + modifyTable(sql, ids, activeTableSchema); + } + + /** + *

+ * Executes a given SQL command (must end with "WHERE "!) and extends the + * WHERE-clause with the primary keys, set to the values in ids. + *

+ * + * @param sql a valid SQL command, ending with "WHERE " + * @param ids list of primary key arrays + * @param schemaName name of the schema which defines the primary keys + */ + public void modifyTable(String sql, List ids, String schemaName) { + FieldConfig fieldConfig = fieldConfigs.get(schemaName); + + try (CoStoSysConnection conn = obtainOrReserveConnection()) { + String where = StringUtils.join(fieldConfig.expandPKNames("%s = ?"), " AND "); + String fullSQL = sql + where; + PreparedStatement ps = null; + try { + conn.setAutoCommit(false); + ps = conn.prepareStatement(fullSQL); + } catch (SQLException e) { + LOG.error("Couldn't prepare: " + fullSQL); + e.printStackTrace(); + } + String[] pks = fieldConfig.getPrimaryKey(); + for (Object[] id : ids) { + for (int i = 0; i < id.length; ++i) { + try { + setPreparedStatementParameterWithType(i + 1, ps, id[i], pks[i], fieldConfig); + } catch (SQLException e) { + e.printStackTrace(); + } + } + try { + ps.addBatch(); + } catch (SQLException e) { + e.printStackTrace(); + } + } + try { + ps.executeBatch(); + conn.commit(); + } catch (SQLException e) { + e.printStackTrace(); + } + } + } + + /** + * just calls ps.setObject(position, value); + * + * @param position + * @param ps + * @param value + * @param fieldName + * @param fieldConfig + * @throws SQLException + */ + private void setPreparedStatementParameterWithType(int position, PreparedStatement ps, Object value, + String fieldName, FieldConfig fieldConfig) throws SQLException { + ps.setObject(position, value); + } + + /** + * Returns the name of a table referenced by an SQL-foreign-key. + * + * @param referencingTable the name of the table for which the foreign keys shall be checked + * @return the name of the first referenced table or null if there + * is no referenced table (i.e. the passed table name denotes a data + * table). + * @throws IllegalArgumentException When referencingTable is null. 
+ */ + public String getReferencedTable(String referencingTable) { + if (referencingTable == null) + throw new IllegalArgumentException("Name of referencing table may not be null."); + + String referencedTable = null; + try (CoStoSysConnection conn = obtainOrReserveConnection()) { + String pgSchema = dbConfig.getActivePGSchema(); + String tableName = referencingTable; + if (referencingTable.contains(".")) { + pgSchema = referencingTable.replaceFirst("\\..*$", ""); + tableName = referencingTable.substring(referencingTable.indexOf('.') + 1); + } + // Lowercasing of the table name since case matters but postgres + // does lowercase on table creation. + ResultSet imported = conn.getMetaData().getImportedKeys("", pgSchema, tableName.toLowerCase()); + + if (imported.next()) { + String pkTableSchema = imported.getString(2); + String pkTableName = imported.getString(3); + referencedTable = pkTableSchema != null ? pkTableSchema + "." + pkTableName : pkTableName; + } + } catch (SQLException e1) { + throw new CoStoSysSQLRuntimeException(e1); + } + return referencedTable; + } + + /** + * Creates a PostgreSQL schema + *

+ * This private method is called by the SQL Connection source, thus + * it takes the Connection as a parameter instead of getting a + * Connection on its own. + *

+ * + * @param schemaName The name of the PostgreSQL schema to create. + * @param conn Connection to the database which should be checked for the + * existence of the schema schemaName. + */ + private void createSchema(String schemaName, Connection conn) { + String sqlStr = "CREATE SCHEMA " + schemaName; + try { + conn.createStatement().execute(sqlStr); + LOG.info("PostgreSQL schema \"{}\" does not exist, it is being created.", schemaName); + } catch (SQLException e) { + LOG.error(sqlStr); + e.printStackTrace(); + } + } + + /** + * Creates the PostgreSQL schema schemaName in the active database. + * + * @param schemaName The name of the PostgreSQL schema to create. + */ + public void createSchema(String schemaName) { + try (CoStoSysConnection conn = obtainOrReserveConnection()) { + createSchema(schemaName, conn.getConnection()); + } + } + + /** + * Creates a new table according to the field schema definition corresponding to + * the active schema name determined in the configuration. + * + * @param tableName the name of the new table + * @throws SQLException + */ + public void createTable(String tableName, String comment) throws SQLException { + createTable(tableName, activeTableSchema, comment); + } + + /** + * Creates a new table according to the field schema definition corresponding to + * the name schemaName given in the configuration file. + * + * @param tableName the name of the new table + * @throws SQLException + */ + public void createTable(String tableName, String schemaName, String comment) { + FieldConfig fieldConfig = fieldConfigs.get(schemaName); + + ArrayList columns = getTableCreationColumns(tableName, fieldConfig); + + createTable(tableName, columns, comment); + + // additionally, restrict the primary key to be unique + // (I don't know why this is necessary, but it is required + // for a referencing table which references several columns, + // that these columns own a UNIQUE constraint.) 
+ if (fieldConfig.getPrimaryKey().length > 0) + alterTable(String.format("ADD CONSTRAINT %s_unique UNIQUE (%s)", tableName.replace(".", ""), + fieldConfig.getPrimaryKeyString()), tableName); + } + + /** + *

+ * Creates a new table according to the field schema definition corresponding to + * the name schemaName and with foreign key references to the + * primary key of referenceTableName. + *

+ *

+ * The primary key of the tables tableName and + * referenceTableName must be equal. The foreign key constraint is + * configured for ON DELETE CASCADE which means, when in the referenced + * table rows are deleted, there are also deleted in the table created by this + * method call. + *

+ * + * @param tableName The name of the new table. + * @param referenceTableName The table to be referenced by this table. + * @param schemaName The table schema determining the structure (especially the primary + * key) of the new table. + * @param comment A comment for the new table. + * @throws SQLException + */ + public void createTable(String tableName, String referenceTableName, String schemaName, String comment) { + FieldConfig fieldConfig = fieldConfigs.get(schemaName); + + ArrayList columns = getTableCreationColumns(tableName, fieldConfig); + columns.add(String.format("CONSTRAINT %s_fkey FOREIGN KEY (%s) REFERENCES %s ON DELETE CASCADE", + tableName.replace(".", ""), fieldConfig.getPrimaryKeyString(), referenceTableName)); + + createTable(tableName, columns, comment); + + // additionally, restrict the primary key to be unique + // (I don't know why this is necessary, but it is required + // for a referencing table which references several columns, + // that these columns own a UNIQUE constraint.) + if (fieldConfig.getPrimaryKey().length > 0) + alterTable(String.format("ADD CONSTRAINT %s_unique UNIQUE (%s)", tableName.replace(".", ""), + fieldConfig.getPrimaryKeyString()), tableName); + } + + /** + * Creates the columns to create a table according to the table schema given by + * fieldConfig for use with {@link #createTable(String, List, String)}. 
+ * + * @param tableName + * @param fieldConfig + * @return + */ + private ArrayList getTableCreationColumns(String tableName, FieldConfig fieldConfig) { + ArrayList columns = new ArrayList(); + for (Map field : fieldConfig.getFields()) { + StringBuilder columnStrBuilder = new StringBuilder(); + columnStrBuilder.append(field.get(JulieXMLConstants.NAME)); + columnStrBuilder.append(" "); + columnStrBuilder.append(field.get(JulieXMLConstants.TYPE)); + columns.add(columnStrBuilder.toString()); + } + if (fieldConfig.getPrimaryKey().length > 0) + columns.add(String.format("CONSTRAINT %s_pkey PRIMARY KEY (%s)", tableName.replace(".", ""), + fieldConfig.getPrimaryKeyString())); + return columns; + } + + /** + * Creates a new table with custom columns. + * + * @param tableName the name of the new table + * @param columns a list of Strings, each containing name, type and constraint of a + * column, e.g. "foo integer primary key" as required for a valid sql + * command. + * @throws CoStoSysSQLRuntimeException If the SQL command fails. + */ + private void createTable(String tableName, List columns, String comment) { + try (CoStoSysConnection conn = obtainOrReserveConnection()) { + StringBuilder sb = new StringBuilder("CREATE TABLE " + tableName + " ("); + for (String column : columns) + sb.append(", " + column); + sb.append(");"); + String sqlString = sb.toString().replaceFirst(", ", ""); + try { + Statement st = conn.createStatement(); + st.execute(sqlString); + st.execute("COMMENT ON TABLE " + tableName + " IS \'" + comment + "\';"); + } catch (SQLException e) { + System.err.println(sqlString); + e.printStackTrace(); + throw new CoStoSysSQLRuntimeException(e); + } + } + } + + /** + *

+ * Does the same as {@link #createSubsetTable(String, String, Integer, String, String)} + * with the exception that the assumed table schema is that of the active schema + * defined in the configuration file. + *

+ * + * @param subsetTable name of the subset table + * @param supersetTable name of the referenced table + * @param maxNumberRefHops the maximum number of times a foreign key reference to a data + * table may be followed + * @param comment will be added to the table in the database, used to make tables + * reproducible + * @throws SQLException + */ + public void createSubsetTable(String subsetTable, String supersetTable, Integer maxNumberRefHops, String comment) + throws SQLException { + createSubsetTable(subsetTable, supersetTable, maxNumberRefHops, comment, activeTableSchema); + } + + /** + *

+ * Does the same as {@link #createSubsetTable(String, String, Integer, String, String)} + * with the exception that the assumed table schema is that of the active schema + * defined in the configuration file and the first referenced data table is used as data table. + *

+ * + * @param subsetTable name of the subset table + * @param supersetTable name of the referenced table + * @param comment will be added to the table in the database, used to make tables + * reproducible + * @throws SQLException + */ + public void createSubsetTable(String subsetTable, String supersetTable, String comment) throws SQLException { + createSubsetTable(subsetTable, supersetTable, null, comment, activeTableSchema); + } + + /** + *

+ * Creates an empty table referencing the primary key of the data table given by + * superSetTable or, if this is a subset table itself, the data + * table referenced by that table. + *

+ *

+ * To fill the empty subset table with data, use one of the + * init[...] methods offered by this class. + *

+ *

+ * Subset tables have a particular table scheme. They define a foreign key to + * the primary key of the referenced data table. There are the following + * additional columns: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
+ * <tr><th>Name</th><th>Type</th></tr>
+ * <tr><td>is_in_process</td><td>boolean</td></tr>
+ * <tr><td>is_processed</td><td>boolean</td></tr>
+ * <tr><td>last_component</td><td>text</td></tr>
+ * <tr><td>log</td><td>text</td></tr>
+ * <tr><td>has errors</td><td>boolean</td></tr>
+ * <tr><td>pid</td><td>character varying(10)</td></tr>
+ * <tr><td>host_name</td><td>character varying(100)</td></tr>
+ * <tr><td>processing_timestamp</td><td>timestamp without time zone</td></tr>
+ *

+ *

+ * The subset table can be used for processing, e.g. by UIMA CollectionReaders, + * which store information about the processing in it. + *

+ * The actual data is located in the referenced table. + * + * @param subsetTable name of the subset table + * @param supersetTable name of the referenced table + * @param posOfDataTable the position of the datatable that should be referenced; the 1st + * would be nearest data table, i.e. perhaps supersetTable + * itself. The 2nd would be the datatable referenced by the first + * data table on the reference path. + * @param schemaName name of the table schema to work with (determined in the + * configuration file) + * @param comment will be added to the table in the database, used to make tables + * reproducable + * @throws SQLException + */ + public void createSubsetTable(String subsetTable, String supersetTable, Integer posOfDataTable, String comment, + String schemaName) throws SQLException { + FieldConfig fieldConfig = fieldConfigs.get(schemaName); + + String effectiveDataTable = getReferencedTable(supersetTable, posOfDataTable); + + ArrayList columns = new ArrayList(); + List> fields = fieldConfig.getFields(); + HashSet pks = new HashSet(Arrays.asList(fieldConfig.getPrimaryKey())); + for (Map field : fields) { + String name = field.get(JulieXMLConstants.NAME); + if (pks.contains(name)) + columns.add(name + " " + field.get(JulieXMLConstants.TYPE)); + } + + // Add the columns to the table. + for (Entry columnDefinition : subsetColumns.entrySet()) { + columns.add(columnDefinition.getKey() + " " + columnDefinition.getValue()); + } + // Define the primary key of the table. 
+ String pkStr = fieldConfig.getPrimaryKeyString(); + columns.add(String.format("CONSTRAINT %s_pkey PRIMARY KEY (%s)", subsetTable.replace(".", ""), pkStr)); + columns.add(String.format("CONSTRAINT %s_fkey FOREIGN KEY (%s) REFERENCES %s ON DELETE CASCADE", + subsetTable.replace(".", ""), pkStr, effectiveDataTable)); + createTable(subsetTable, columns, comment); + createIndex(subsetTable, Constants.IS_PROCESSED, Constants.IN_PROCESS); + } + + /** + * Creates an index for table table on the given columns. The + * name of the index will be <table>_idx. It is currently not + * possible to create a second index since the names would collide. This would + * require an extension of this method for different names. + * + * @param table The table for which an index should be created. + * @param columns The columns the index should cover. + * @throws SQLException In case something goes wrong. + */ + public void createIndex(String table, String... columns) throws SQLException { + try (CoStoSysConnection conn = obtainOrReserveConnection()) { + String sql = String.format("CREATE INDEX %s_idx ON %s (%s)", table.replace(".", ""), table, + String.join(",", columns)); + conn.createStatement().execute(sql); + } + } + + /** + * Gets the - possibly indirectly - referenced table of startTable + * where posOfDataTable specifies the position of the desired table in + * the reference chain starting at startTable. + * + * @param startTable + * @param posOfDataTable + * @return + * @throws SQLException + */ + public String getReferencedTable(String startTable, Integer posOfDataTable) throws SQLException { + if (posOfDataTable == null) + posOfDataTable = 1; + int currentDatatablePosition = isDataTable(startTable) ? 
1 : 0; + Set blacklist = new HashSet<>(); + String effectiveDataTable = startTable; + String lasttable = ""; + while (isSubsetTable(effectiveDataTable) || currentDatatablePosition < posOfDataTable) { + if (blacklist.contains(effectiveDataTable)) { + if (effectiveDataTable.equals(lasttable)) + throw new IllegalStateException( + "The table \"" + lasttable + "\" has a foreign key on itself. This is not allowed."); + throw new IllegalStateException( + "Fatal error: There is a circel in the foreign key chain. The table \"" + effectiveDataTable + + "\" has been found twice when following the foreign key chain of the table \"" + + startTable + "\"."); + } + blacklist.add(effectiveDataTable); + lasttable = effectiveDataTable; + effectiveDataTable = getNextDataTable(effectiveDataTable); + currentDatatablePosition++; + } + return effectiveDataTable; + } + + /** + * Follows the foreign-key specifications of the given table to the referenced table. This process is repeated until + * a non-subset table (a table for which {@link #isSubsetTable(String)} returns false) is encountered + * or a table without a foreign-key is found. If referencingTable has no foreign-key itself, null is returned + * since the referenced table does not exist. + * + * @param referencingTable The table to get the next referenced data table for, possibly across other subsets if referencingTable denotes a subset table.. + * @return The found data table or null, if referencingTable is a data table itself. + * @throws CoStoSysSQLRuntimeException If table meta data checking fails. + */ + public String getNextDataTable(String referencingTable) { + String referencedTable = getReferencedTable(referencingTable); + while (isSubsetTable(referencedTable)) { + referencedTable = getReferencedTable(referencedTable); + } + return referencedTable; + } + + /** + * Determines the first data table on the reference path referencingTable -> table1 -> table2 -> ... -> lastTable -> null + * referenced from referencingTable. 
     * This means that referencingTable is returned itself
     * if it is a data table.
     *
     * @param referencingTable The start point table for the path for which the first data table is to be returned.
     * @return The first data table on the foreign-key path beginning with referencingTable itself.
     * @throws SQLException If a database operation fails.
     */
    public String getNextOrThisDataTable(String referencingTable) {
        if (isDataTable(referencingTable))
            return referencingTable;
        return getNextDataTable(referencingTable);
    }

    /**
     * <p>
     * Checks if the given table is a subset table.
     * </p>
     * <p>
     * A database table is identified to be a subset table if it exhibits all the column names that subsets
     * have. Those are defined in {@link #subsetColumns}.
     * </p>
     *
     * @param table The table to check for being a subset table.
     * @return True, iff table denotes a subset table, false otherwise. The latter case includes the table parameter being null.
     * @throws CoStoSysSQLRuntimeException If table meta data checking fails.
     */
    public boolean isSubsetTable(String table) {
        if (table == null)
            return false;
        try (CoStoSysConnection conn = obtainOrReserveConnection()) {
            // Split an optionally qualified name "pgSchema.table"; unqualified names
            // default to the active Postgres schema from the configuration.
            String pgSchema = dbConfig.getActivePGSchema();
            String tableName = table;
            if (table.contains(".")) {
                pgSchema = table.replaceFirst("\\..*$", "");
                tableName = table.substring(table.indexOf('.') + 1);
            }
            try {
                // Do lowercase on the table name: Case matters and postgres always
                // lowercases the names on creation...
                ResultSet columns = conn.getMetaData().getColumns(null, pgSchema, tableName.toLowerCase(), null);
                int numSubsetColumnsFound = 0;
                while (columns.next()) {
                    // Column index 4 of a DatabaseMetaData#getColumns() result set is COLUMN_NAME.
                    String columnName = columns.getString(4);
                    if (subsetColumns.keySet().contains(columnName))
                        numSubsetColumnsFound++;
                }
                // Only a table exhibiting ALL subset columns is considered a subset table.
                return numSubsetColumnsFound == subsetColumns.size();
            } catch (SQLException e) {
                throw new CoStoSysSQLRuntimeException(e);
            }
        }
    }

    /**
     * Checks whether the given table is a data table. A data table is any table
     * that is not a subset table, see {@link #isSubsetTable(String)}.
     *
     * @param table The table to check.
     * @return True, iff the table is not a subset table.
     */
    public boolean isDataTable(String table) {
        return !isSubsetTable(table);
    }

    /**
     * Drops the given table from the database.
     *
     * @param table The (optionally schema-qualified) name of the table to drop.
     * @return The return value of {@link Statement#execute(String)} for the DROP TABLE command.
     * @throws SQLException If the DROP TABLE command fails, e.g. because the table does not exist.
     */
    public boolean dropTable(String table) throws SQLException {
        try (CoStoSysConnection conn = obtainOrReserveConnection()) {
            Statement stmt = conn.createStatement();
            String sql = "DROP TABLE " + table;
            return stmt.execute(sql);
        }
    }

    /**
     * Tests if a table exists.
     *
     * @param conn      the connection to use for the existence check
     * @param tableName name of the table to test
     * @return true if the table exists, false otherwise (including the case that the check itself failed)
     */
    public boolean tableExists(CoStoSysConnection conn, String tableName) {
        if (tableName == null)
            throw new IllegalArgumentException("The passed table name is null.");
        try {
            Statement stmt = conn.createStatement();
            String pureTableName = tableName;
            String schemaName = dbConfig.getActivePGSchema();
            if (tableName.contains(".")) {
                String[] split = tableName.split("\\.");
                schemaName = split[0];
                pureTableName = split[1];
            }
            // Lowercase the names because in Postgres they are lowercased
            // automatically when the tables are created. Thus, when not
            // lowercasing we risk to miss the correct entry.
            String sql = String.format(
                    "select schemaname,tablename from pg_tables where schemaname = '%s' and tablename = '%s'",
                    schemaName.toLowerCase(), pureTableName.toLowerCase());
            LOG.trace("Checking whether table {} in schema {} exists.", pureTableName, schemaName);
            LOG.trace("Sent query (names have been lowercased to match Postgres table names): {}", sql);
            ResultSet res = stmt.executeQuery(sql);
            return res.next();
        } catch (SQLException e) {
            e.printStackTrace();
            SQLException ne = e.getNextException();
            if (null != ne)
                ne.printStackTrace();
        }
        return false;
    }

    /**
     * Tests if a table exists, obtaining or reserving a connection internally.
     *
     * @param tableName name of the table to test
     * @return true if the table exists, false otherwise
     */
    public boolean tableExists(String tableName) {
        try (CoStoSysConnection conn = obtainOrReserveConnection()) {
            return tableExists(conn, tableName);
        }
    }

    /**
     * Tests if a schema exists.
     *
     * <p>
     * This private method is called by the SQL Connection source, thus
     * it takes the Connection as a parameter instead of getting a
     * Connection on its own.
     * </p>
     *
     * @param schemaName name of the schema to test
     * @param conn       Connection to the database which should be checked for the
     *                   existence of the schema schemaName.
     * @return true if the schema exists, false otherwise (including the case that the check itself failed)
     */
    private boolean schemaExists(String schemaName, Connection conn) {
        try {
            // pg_namespace lists all schemas known to the Postgres instance.
            ResultSet rs = conn.createStatement()
                    .executeQuery("SELECT * FROM pg_namespace WHERE nspname = '" + schemaName + "'");
            return rs.next();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return false;
    }

    /**
     * Tests if a schema exists, obtaining or reserving a connection internally.
     *
     * @param schemaName name of the schema to test
     * @return true if the schema exists, false otherwise
     */
    public boolean schemaExists(String schemaName) {
        try (CoStoSysConnection conn = obtainOrReserveConnection()) {
            boolean exists = schemaExists(schemaName, conn.getConnection());

            return exists;
        }
    }

    /**
     * Tests if a table contains entries.
     *
     * @param tableName name of the table to test (the original doc said "schema", which was incorrect)
     * @return true if the table has no entries, false otherwise.
     *         NOTE(review): also returns false if the query fails, so callers cannot
     *         distinguish "has entries" from "check failed".
     */
    public boolean isEmpty(String tableName) {

        // LIMIT 1 suffices: we only need to know whether at least one row exists.
        String sqlStr = "SELECT * FROM " + tableName + " LIMIT 1";
        try (CoStoSysConnection conn = obtainOrReserveConnection()) {
            Statement st = conn.createStatement();
            ResultSet res = st.executeQuery(sqlStr);

            return !res.next();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return false;
    }

    /**************************************************************************
     ******************************** Data Import *****************************
     **************************************************************************/

    /**
     * <p>
     * Convenience method for creating and initializing a subset in one step. See
     * the method references below for more information.
     * </p>
     *
     * @param size          number of randomly drawn rows for the subset
     * @param subsetTable   name of the subset table to create
     * @param supersetTable name of the table to draw the random sample from
     * @param comment       comment stored with the new table
     * @throws SQLException If subset creation or initialization fails.
     * @see #initRandomSubset(int, String, String)
     */
    public void defineRandomSubset(int size, String subsetTable, String supersetTable, String comment)
            throws SQLException {
        createSubsetTable(subsetTable, supersetTable, comment);
        initRandomSubset(size, subsetTable, supersetTable);
    }

    /**
     * <p>
     * Convenience method for creating and initializing a subset in one step. See
     * the method references below for more information.
     * </p>
     *
     * @param size          number of randomly drawn rows for the subset
     * @param subsetTable   name of the subset table to create
     * @param supersetTable name of the table to draw the random sample from
     * @param comment       comment stored with the new table
     * @param schemaName    name of the table schema to use
     * @throws SQLException If subset creation or initialization fails.
     * @see #initRandomSubset(int, String, String, String)
     */
    public void defineRandomSubset(int size, String subsetTable, String supersetTable, String comment,
                                   String schemaName) throws SQLException {
        // NOTE(review): the sibling convenience methods pass the trailing arguments as
        // (..., null, comment, schemaName) while this one passes (..., null, schemaName, comment).
        // One of the two orders is presumably wrong - verify against the
        // createSubsetTable overload signatures.
        createSubsetTable(subsetTable, supersetTable, null, schemaName, comment);
        initRandomSubset(size, subsetTable, supersetTable, schemaName);
    }

    /**
     * <p>
     * Convenience method for creating and initializing a subset in one step. See
     * the method references below for more information.
     * </p>
     *
     * @param values        desired values of columnToTest; only matching rows enter the subset
     * @param subsetTable   name of the subset table to create
     * @param supersetTable name of the table to select rows from
     * @param columnToTest  column checked against the given values
     * @param comment       comment stored with the new table
     * @throws SQLException If subset creation or initialization fails.
     * @see #initSubset(List, String, String, String)
     */
    public void defineSubset(List<String> values, String subsetTable, String supersetTable, String columnToTest,
                             String comment) throws SQLException {
        createSubsetTable(subsetTable, supersetTable, comment);
        initSubset(values, subsetTable, supersetTable, columnToTest);
    }

    /**
     * <p>
     * Convenience method for creating and initializing a subset in one step. See
     * the method references below for more information.
     * </p>
     *
     * @param values        desired values of columnToTest; only matching rows enter the subset
     * @param subsetTable   name of the subset table to create
     * @param supersetTable name of the table to select rows from
     * @param columnToTest  column checked against the given values
     * @param comment       comment stored with the new table
     * @param schemaName    name of the table schema to use
     * @throws SQLException If subset creation or initialization fails.
     * @see #initSubset(List, String, String, String, String)
     */
    public void defineSubset(List<String> values, String subsetTable, String supersetTable, String columnToTest,
                             String comment, String schemaName) throws SQLException {
        createSubsetTable(subsetTable, supersetTable, null, comment, schemaName);
        initSubset(values, subsetTable, supersetTable, columnToTest, schemaName);
    }

    /**
     * <p>
     * Convenience method for creating and initializing a subset in one step. See
     * the method references below for more information.
     * </p>
     *
     * @param subsetTable   name of the subset table to create
     * @param supersetTable name of the table whose primary keys are copied
     * @param comment       comment stored with the new table
     * @throws SQLException If subset creation or initialization fails.
     * @see #initSubset(String, String)
     */
    public void defineSubset(String subsetTable, String supersetTable, String comment) throws SQLException {
        createSubsetTable(subsetTable, supersetTable, comment);
        initSubset(subsetTable, supersetTable);
    }

    /**
     * <p>
     * Convenience method for creating and initializing a subset in one step. See
     * the method references below for more information.
     * </p>
     *
     * @param subsetTable   name of the subset table to create
     * @param supersetTable name of the table whose primary keys are copied
     * @param comment       comment stored with the new table
     * @param schemaName    name of the table schema to use
     * @throws SQLException If subset creation or initialization fails.
     * @see #initSubset(String, String, String)
     */
    public void defineSubset(String subsetTable, String supersetTable, String comment, String schemaName)
            throws SQLException {
        createSubsetTable(subsetTable, supersetTable, null, comment, schemaName);
        initSubset(subsetTable, supersetTable, schemaName);
    }

    /**
     * <p>
     * Convenience method for creating and initializing a subset in one step. See
     * the method references below for more information.
     * </p>
     *
     * @param subsetTable      name of the subset table to create
     * @param supersetTable    name of the table to select rows from
     * @param conditionToCheck SQL WHERE condition restricting the copied rows
     * @param comment          comment stored with the new table
     * @throws SQLException If subset creation or initialization fails.
     * @see #initSubsetWithWhereClause(String, String, String)
     */
    public void defineSubsetWithWhereClause(String subsetTable, String supersetTable, String conditionToCheck,
                                            String comment) throws SQLException {
        createSubsetTable(subsetTable, supersetTable, comment);
        initSubsetWithWhereClause(subsetTable, supersetTable, conditionToCheck);
    }

    /**
     * <p>
     * Convenience method for creating and initializing a subset in one step. See
     * the method references below for more information.
     * </p>
     *
     * @param subsetTable      name of the subset table to create
     * @param supersetTable    name of the table to select rows from
     * @param conditionToCheck SQL WHERE condition restricting the copied rows
     * @param comment          comment stored with the new table
     * @param schemaName       name of the table schema to use
     * @throws SQLException If subset creation or initialization fails.
     * @see #initSubsetWithWhereClause(String, String, String, String)
     */
    public void defineSubsetWithWhereClause(String subsetTable, String supersetTable, String conditionToCheck,
                                            String comment, String schemaName) throws SQLException {
        createSubsetTable(subsetTable, supersetTable, null, comment, schemaName);
        initSubsetWithWhereClause(subsetTable, supersetTable, conditionToCheck, schemaName);
    }

    /**
     * <p>
     * Convenience method for creating and initializing a mirror subset in one step. See
     * the method references below for more information.
     * </p>
     *
     * @param subsetTable   name of the mirror subset table to create
     * @param supersetTable name of the data table to mirror
     * @param performUpdate whether data table changes should be propagated to this mirror
     * @param comment       comment stored with the new table
     * @throws SQLException If subset creation or initialization fails.
     */
    public void defineMirrorSubset(String subsetTable, String supersetTable, boolean performUpdate, String comment)
            throws SQLException {
        createSubsetTable(subsetTable, supersetTable, comment);
        initMirrorSubset(subsetTable, supersetTable, performUpdate);
    }

    /**
     * <p>
     * Convenience method for creating and initializing a mirror subset in one step. See
     * the method references below for more information.
     * </p>
     *
     * @param subsetTable      name of the mirror subset table to create
     * @param supersetTable    name of the data table to mirror
     * @param performUpdate    whether data table changes should be propagated to this mirror
     * @param maxNumberRefHops the maximum number of times a foreign key reference to a data
     *                         table may be followed
     * @param comment          comment stored with the new table
     * @throws SQLException If subset creation or initialization fails.
     * @see #createSubsetTable(String, String, Integer, String)
     */
    public void defineMirrorSubset(String subsetTable, String supersetTable, boolean performUpdate,
                                   Integer maxNumberRefHops, String comment) throws SQLException {
        createSubsetTable(subsetTable, supersetTable, maxNumberRefHops, comment);
        initMirrorSubset(subsetTable, supersetTable, performUpdate);
    }

    /**
     * <p>
     * Convenience method for creating and initializing a mirror subset in one step. See
     * the method references below for more information.
     * </p>
     *
     * @param subsetTable   name of the mirror subset table to create
     * @param supersetTable name of the data table to mirror
     * @param performUpdate whether data table changes should be propagated to this mirror
     * @param comment       comment stored with the new table
     * @param schemaName    name of the table schema to use
     * @throws SQLException If subset creation or initialization fails.
     */
    public void defineMirrorSubset(String subsetTable, String supersetTable, boolean performUpdate, String comment,
                                   String schemaName) throws SQLException {
        createSubsetTable(subsetTable, supersetTable, null, comment, schemaName);
        initMirrorSubset(subsetTable, supersetTable, performUpdate, schemaName);
    }

    /**
     * Initializes a random subset using the active table schema.
     *
     * @see #initRandomSubset(int, String, String, String)
     */
    public void initRandomSubset(int size, String subsetTable, String supersetTable) {
        initRandomSubset(size, subsetTable, supersetTable, activeTableSchema);
    }

    /**
     * <p>
     * Selects size rows of the given super set table randomly and
     * inserts them into the subset table.
     * </p>
     *
     * @param size          size of the subset to create
     * @param subsetTable   name of subset table to insert the chosen rows into
     * @param superSetTable name of the table to choose from
     * @param schemaName    name of the schema to use
     */
    public void initRandomSubset(int size, String subsetTable, String superSetTable, String schemaName) {
        FieldConfig fieldConfig = fieldConfigs.get(schemaName);
        // ORDER BY RANDOM() shuffles the superset rows; LIMIT then draws the sample.
        String sql = "INSERT INTO " + subsetTable + " (SELECT %s FROM " + superSetTable + " ORDER BY RANDOM() LIMIT "
                + size + ");";
        sql = String.format(sql, fieldConfig.getPrimaryKeyString());
        try (CoStoSysConnection conn = obtainOrReserveConnection()) {
            conn.createStatement().execute(sql);
        } catch (SQLException e) {
            LOG.error(sql);
            e.printStackTrace();
        }
    }

    // TODO: could be merged with defineSubsetWithWhereClause ?
    // EF: But here the ID list is broken down into smaller lists for which the
    // where clause is built. defineSubsetWithWhereClause isn't capable of such
    // things. So my vote is to let it the current way (09.01.2012).

    /**
     * Defines a subset by populating a subset table with primary keys from another
     * table. A WHERE clause is used to control which entries are copied, checking
     * if columnToTest has the desired value. Uses the active table schema.
     *
     * @param values        Desired values for the columnToTest
     * @param subsetTable   name of the subset table
     * @param supersetTable name of table to reference
     * @param columnToTest  column to check for value
     */
    public void initSubset(List<String> values, String subsetTable, String supersetTable, String columnToTest) {
        initSubset(values, subsetTable, supersetTable, columnToTest, activeTableSchema);
    }

    /**
     * Defines a subset by populating a subset table with primary keys from another
     * table. A WHERE clause is used to control which entries are copied, checking
     * if columnToTest has the desired value.
+ * + * @param values Desired values for the columnToTest + * @param subsetTable name of the subset table + * @param supersetTable name of table to reference + * @param schemaName schema to use + * @param columnToTest column to check for value + */ + public void initSubset(List values, String subsetTable, String supersetTable, String columnToTest, + String schemaName) { + FieldConfig fieldConfig = fieldConfigs.get(schemaName); + + int idSize = values.size(); + + Statement st; + String sql = null; + try (CoStoSysConnection conn = obtainOrReserveConnection()) { + st = conn.createStatement(); + for (int i = 0; i < idSize; i += ID_SUBLIST_SIZE) { + List subList = i + ID_SUBLIST_SIZE - 1 < idSize ? values.subList(i, i + ID_SUBLIST_SIZE) + : values.subList(i, idSize); + if (fieldConfig.isOfStringType(columnToTest)) + ; + String expansionString = columnToTest + " = '%s'"; + String[] expandedIDs = JulieXMLTools.expandArrayEntries(subList, expansionString); + String where = StringUtils.join(expandedIDs, " OR "); + sql = "INSERT INTO " + subsetTable + " (SELECT " + fieldConfig.getPrimaryKeyString() + " FROM " + + supersetTable + " WHERE " + where + ")"; + st.execute(sql); + } + } catch (SQLException e) { + LOG.error("SQLError while initializing subset {}. SQL query was: {}", subsetTable, sql); + e.printStackTrace(); + } + } + + /** + * Initializes subsetTable by inserting one row for each entry in supersetTable. + * + * @param subsetTable + * @param supersetTable + * @see #initSubset(String, String, String) + */ + public void initSubset(String subsetTable, String supersetTable) { + initSubset(subsetTable, supersetTable, activeTableSchema); + } + + /** + * Defines a subset by populating a subset table with all primary keys from + * another table. 
+ * + * @param subsetTable name of the subset table + * @param supersetTable name of table to reference + * @param schemaName name of the schema used to determine the primary keys + */ + public void initSubset(String subsetTable, String supersetTable, String schemaName) { + FieldConfig fieldConfig = fieldConfigs.get(schemaName); + + if (fieldConfig.getPrimaryKey().length == 0) + throw new IllegalStateException("Not subset tables corresponding to table scheme \"" + fieldConfig.getName() + + "\" can be created since this scheme does not define a primary key."); + + + try (CoStoSysConnection conn = obtainOrReserveConnection()) { + String pkStr = fieldConfig.getPrimaryKeyString(); + + Statement st = conn.createStatement(); + String stStr = String.format("INSERT INTO %s (%s) (SELECT %s FROM %s);", subsetTable, pkStr, pkStr, + supersetTable); + st.execute(stStr); + } catch (SQLException e) { + e.printStackTrace(); + } + } + + /** + * Defines a subset by populating a subset table with primary keys from another + * table. All those entries are selected, for which the conditionToCheck is + * true. + * + * @param subsetTable name of the subset table + * @param supersetTable name of table to reference + * @param whereClause condition to check by a SQL WHERE clause, e.g. 'foo > 10' + * @see #initSubsetWithWhereClause(String, String, String, String) + */ + public void initSubsetWithWhereClause(String subsetTable, String supersetTable, String whereClause) { + initSubsetWithWhereClause(subsetTable, supersetTable, whereClause, activeTableSchema); + } + + /** + * Defines a subset by populating a subset table with primary keys from another + * table. All those entries are selected, for which the conditionToCheck is + * true. + * + * @param subsetTable name of the subset table + * @param supersetTable name of table to reference + * @param schemaName name of the schema used to determine the primary keys + * @param whereClause condition to check by a SQL WHERE clause, e.g. 
     *                      'foo > 10'
     */
    public void initSubsetWithWhereClause(String subsetTable, String supersetTable, String whereClause,
                                          String schemaName) {
        FieldConfig fieldConfig = fieldConfigs.get(schemaName);


        String stStr = null;
        try (CoStoSysConnection conn = obtainOrReserveConnection()) {
            // Allow callers to pass the condition with or without the leading WHERE keyword.
            if (!whereClause.toUpperCase().startsWith("WHERE"))
                whereClause = "WHERE " + whereClause;

            String pkStr = fieldConfig.getPrimaryKeyString();

            Statement st = conn.createStatement();
            stStr = String.format("INSERT INTO %s (%s) (SELECT %s FROM %s %s);", subsetTable, pkStr, pkStr,
                    supersetTable, whereClause);
            st.execute(stStr);
        } catch (SQLException e) {
            LOG.error(stStr);
            e.printStackTrace();
        }
    }

    /**
     * Initializes a mirror subset using the active table schema.
     *
     * @see #initMirrorSubset(String, String, boolean, String)
     */
    public void initMirrorSubset(String subsetTable, String supersetTable, boolean performUpdate) throws SQLException {
        initMirrorSubset(subsetTable, supersetTable, performUpdate, activeTableSchema);
    }

    /**
     * Defines a mirror subset populating a subset table with primary keys from
     * another table.<br>
     * Its name is saved into a special meta data table to enable automatic syncing
     * (changes to the superset are propagated to the mirror subset).
     *
     * @param subsetTable   name of the subset table
     * @param supersetTable name of table to reference
     * @param performUpdate whether data table changes should be propagated to this mirror subset
     * @param schemaName    name of the table schema to use
     * @throws SQLException If creating the mirror collection table or the subset fails.
     */
    public void initMirrorSubset(String subsetTable, String supersetTable, boolean performUpdate, String schemaName)
            throws SQLException {
        // TODO if the supersetTable is actually a subset table, we must
        // determine the correct schema of the data table which will eventually
        // be referenced and create/insert into the mirrorTable there! Currently
        // the mirrorTable can be located in the wrong places.
        // table listing mirror tables
        if (!subsetTable.contains("."))
            subsetTable = dbConfig.getActivePGSchema().concat(".").concat(subsetTable);

        // Create the mirror table list if not existing.
        if (!tableExists(Constants.MIRROR_COLLECTION_NAME)) {
            List<String> columns = new ArrayList<>();
            columns.add(Constants.MIRROR_COLUMN_DATA_TABLE_NAME + " text");
            columns.add(Constants.MIRROR_COLUMN_SUBSET_NAME + " text");
            columns.add(Constants.MIRROR_COLUMN_DO_RESET + " boolean DEFAULT true");
            columns.add(String.format("CONSTRAINT %s_pkey PRIMARY KEY (%s)", Constants.MIRROR_COLLECTION_NAME.replace(".", ""),
                    Constants.MIRROR_COLUMN_SUBSET_NAME));
            createTable(Constants.MIRROR_COLLECTION_NAME, columns,
                    "This table disposes the names of subset tables which mirror the data table " + supersetTable
                            + ". These subset tables will be updated as " + supersetTable
                            + " will obtains updates (insertions as well as deletions).");
        }
        // Create the actual subset and fill it to contain all primary key
        // values of the data table.
        initSubset(subsetTable, supersetTable, schemaName);

        // Add the new subset table to the list of mirror subset tables.
        String sql = null;
        try (CoStoSysConnection conn = obtainOrReserveConnection()) {
            Statement st = conn.createStatement();
            sql = String.format("INSERT INTO %s VALUES ('%s','%s',%b)", Constants.MIRROR_COLLECTION_NAME, supersetTable, subsetTable,
                    performUpdate);
            st.execute(sql);
        } catch (SQLException e) {
            LOG.error("Error executing SQL command: " + sql, e);
        }
    }

    /**
     * Retrieves the mirror subsets registered for the given data table.
     *
     * @param conn      the connection to use
     * @param tableName table to gather mirror subsets for
     * @return names of all mirror subsets for this table, mapped to their "perform update"
     *         flag, or null if the mirror collection table does not exist
     */
    private LinkedHashMap<String, Boolean> getMirrorSubsetNames(CoStoSysConnection conn, String tableName) {
        if (!tableExists(conn, Constants.MIRROR_COLLECTION_NAME))
            return null;

        // The mirror tables are inserted into the collecting table with schema
        // information. If the given data table is not qualified, we assume it
        // to be in the same postgres scheme as the looked-up mirror subset
        // collection table. And that is - for unqualified data tables - the
        // active postgres scheme given in the configuration file (see
        // 'getMirrorCollectionTableName' on how the mirror subset collection
        // table name is determined).
        if (!tableName.contains("."))
            tableName = dbConfig.getActivePGSchema() + "." + tableName;

        LinkedHashMap<String, Boolean> mirrorSubsetList = new LinkedHashMap<>();

        try {
            Statement stmt = conn.createStatement();
            ResultSet rs = stmt.executeQuery(String.format(
                    "SELECT %s,%s FROM %s WHERE " + Constants.MIRROR_COLUMN_DATA_TABLE_NAME + "='%s'",
                    Constants.MIRROR_COLUMN_SUBSET_NAME, Constants.MIRROR_COLUMN_DO_RESET, Constants.MIRROR_COLLECTION_NAME, tableName));
            while (rs.next()) {
                String mirrorTable = rs.getString(1);
                Boolean performUpdate = rs.getBoolean(2);
                // Only accept mirror tables that actually reference the given data table.
                String refDataTable = getReferencedTable(mirrorTable);
                if (refDataTable != null && refDataTable.equals(tableName))
                    mirrorSubsetList.put(mirrorTable, performUpdate);
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return mirrorSubsetList;
    }

    /**
     * Sets the values in the is_processed, is_in_process,
     * has_errors and log columns of a subset to
     * FALSE.
     *
     * @param subsetTableName name of the subset to reset
     */
    public void resetSubset(String subsetTableName) {
        resetSubset(subsetTableName, false, false, null);
    }

    /**
     * Sets the values in the is_processed, is_in_process,
     * has_errors and log columns of a subset to
     * FALSE where the corresponding rows are
     * is_in_process or is_processed.
     *

+ * The boolean parameter whereNotProcessed is used for the use case + * where only those rows should be reset that are in_process but + * not is_processed which may happen when a pipeline crashed, a + * document has errors or a pipeline ist just canceled. + *

+ *

+ * In a similar fashion, whereNoErrors resets those rows that have + * no errors. + *

+ *

+ * Both boolean parameters may be combined in which case only non-processed rows + * without errors will be reset. + *

+ * + * @param subsetTableName name of the table to reset unprocessed rows + */ + public void resetSubset(String subsetTableName, boolean whereNotProcessed, boolean whereNoErrors, + String lastComponent) { + String stStr = null; + try (CoStoSysConnection conn = obtainOrReserveConnection()) { + List constraints = new ArrayList<>(); + if (whereNotProcessed) + constraints.add(Constants.IS_PROCESSED + " = FALSE"); + if (whereNoErrors) + constraints.add(Constants.HAS_ERRORS + " = FALSE"); + if (lastComponent != null) + constraints.add(Constants.LAST_COMPONENT + " = '" + lastComponent + "'"); + Statement st = conn.createStatement(); + stStr = String.format( + "UPDATE %s SET %s = FALSE, %s = FALSE, %s='%s', %s = FALSE, %s = NULL, %s = NULL WHERE (%s = TRUE OR %s = TRUE)", + subsetTableName, Constants.IN_PROCESS, Constants.IS_PROCESSED, Constants.LAST_COMPONENT, + DEFAULT_PIPELINE_STATE, Constants.HAS_ERRORS, Constants.LOG, Constants.PROCESSING_TIMESTAMP, + Constants.IS_PROCESSED, Constants.IN_PROCESS); + if (!constraints.isEmpty()) + stStr += " AND " + constraints.stream().collect(Collectors.joining(" AND ")); + st.execute(stStr); + } catch (SQLException e) { + LOG.error("Error executing SQL command: " + stStr, e); + } + } + + /** + * @param subsetTableName + * @param pkValues + * @return + */ + public int[] resetSubset(CoStoSysConnection conn, String subsetTableName, List pkValues) { + return resetSubset(conn, subsetTableName, pkValues, activeTableSchema); + } + + public int[] performBatchUpdate(CoStoSysConnection conn, List pkValues, String sqlFormatString, String schemaName) { + + FieldConfig fieldConfig = fieldConfigs.get(schemaName); + + String stStr = null; + List resultList = new ArrayList<>(); + boolean autoCommit = true; + try { + autoCommit = conn.getAutoCommit(); + conn.setAutoCommit(false); + String whereArgument = StringUtils.join(fieldConfig.expandPKNames("%s = ?"), " AND "); + stStr = String.format(sqlFormatString, whereArgument); + + LOG.trace("Performing 
batch update with SQL command: {}", stStr); + + PreparedStatement ps = conn.prepareStatement(stStr); + int i = 0; + for (Object[] id : pkValues) { + for (int j = 0; j < id.length; ++j) { + setPreparedStatementParameterWithType(j + 1, ps, id[j], fieldConfig.getPrimaryKey()[j], + fieldConfig); + } + ps.addBatch(); + + if (i >= commitBatchSize) { + int[] results = ps.executeBatch(); + for (int result : results) + resultList.add(result); + conn.commit(); + ps.clearBatch(); + i = 0; + } + ++i; + } + int[] results = ps.executeBatch(); + for (int result : results) + resultList.add(result); + conn.commit(); + + } catch (SQLException e) { + LOG.error("Error executing SQL command: " + stStr, e); + } finally { + try { + conn.setAutoCommit(autoCommit); + } catch (SQLException e) { + LOG.error("Could not set auto commit to its original value", e); + } + } + int[] ret = new int[resultList.size()]; + for (int i = 0; i < ret.length; i++) + ret[i] = resultList.get(i); + return ret; + } + + /** + * Sets the values in the is_processed and + * is_in_process rows of a subset to FALSE. Only + * resets the subset table rows where the primary key equals one of the entries + * in pkValues. + * + * @param subsetTableName - name of the table to reset + * @param pkValues - list of primary keys + * @return + */ + public int[] resetSubset(CoStoSysConnection conn, String subsetTableName, List pkValues, String schemaName) { + // We intentionally do not check whether the rows are already reset + // because we want the only reason for the update to not affect a + // row to be that the row doesn't exist. 
+ // The original where was: 'where (is_processed = TRUE OR + // is_in_process = TRUE) AND %s' + String updateFormatString = "UPDATE " + subsetTableName + " SET " + Constants.IS_PROCESSED + "=FALSE, " + + Constants.IN_PROCESS + "= FALSE, " + Constants.LAST_COMPONENT + "='" + DEFAULT_PIPELINE_STATE +"," + Constants.HOST_NAME + "=NULL" + + "' WHERE %s"; + return performBatchUpdate(conn, pkValues, updateFormatString, schemaName); + } + + public int[] determineExistingSubsetRows(CoStoSysConnection conn, String subsetTableName, List pkValues, String schemaName) { + String updateFormatString = "UPDATE " + subsetTableName + " SET has_errors = has_errors " + "where %s"; + return performBatchUpdate(conn, pkValues, updateFormatString, schemaName); + } + + /** + * @param xmls + * @param tableName + * @param identifier + * @see #importFromXML(Iterable, String, String, String) + */ + public void importFromXML(Iterable xmls, String identifier, String tableName) { + importFromXML(xmls, tableName, identifier, activeTableSchema); + } + + /** + * Imports XMLs into a table. + * + * @param xmls - an Iterator over XMLs as byte[] + * @param tableName - name of the table to import + * @param identifier - used for error messages + */ + public void importFromXML(Iterable xmls, String tableName, String identifier, String schemaName) { + FieldConfig fieldConfig = fieldConfigs.get(schemaName); + + for (byte[] xml : xmls) { + Iterator> it = JulieXMLTools.constructRowIterator(xml, BUFFER_SIZE, + fieldConfig.getForEachXPath(), fieldConfig.getFields(), identifier); + importFromRowIterator(it, tableName); + } + } + + /** + * Import new medline XMLs in a existing table from an XML file or a directory + * of XML files. The XML must be in MEDLINE XML format and can additionally be + * (G)Zipped. 
     *
     * @param fileStr   - path to file or directory of (G)Zipped MEDLINE XML file(s)
     * @param tableName - name of the target table
     * @see #importFromXMLFile(String, String, String)
     */
    public void importFromXMLFile(String fileStr, String tableName) {
        importFromXMLFile(fileStr, tableName, activeTableSchema);
    }

    /**
     * Import new medline XMLs in a existing table from an XML file or a directory
     * of XML files. The XML must be in MEDLINE XML format and can additionally be
     * (G)Zipped.
     *
     * @param fileStr    - path to file or directory of (G)Zipped MEDLINE XML file(s)
     * @param tableName  - name of the target table
     * @param schemaName the table schema to use for the import
     */
    public void importFromXMLFile(String fileStr, String tableName, String schemaName) {
        LOG.info("Starting import...");

        FieldConfig fieldConfig = fieldConfigs.get(schemaName);

        // A single file is imported directly; for a directory, all archive/XML
        // members are collected.
        String[] fileNames;
        File fileOrDir = new File(fileStr);
        if (!fileOrDir.isDirectory()) {
            fileNames = new String[1];
            fileNames[0] = fileStr;
        } else {
            fileNames = fileOrDir.list(new FilenameFilter() {
                public boolean accept(File arg0, String arg1) {
                    // TODO write accepted file extensions into configuration
                    return arg1.endsWith(".zip") || arg1.endsWith(".gz") || arg1.endsWith(".xml");
                }
            });
        }
        // medline files are sorted chronological
        Arrays.sort(fileNames);
        XMLPreparer xp = new XMLPreparer(fileOrDir, fieldConfig);
        for (String fileName : fileNames) {
            LOG.info("Importing " + fileName);
            Iterator<Map<String, Object>> it = xp.prepare(fileName);
            importFromRowIterator(it, tableName, true, schemaName);
        }
    }

    /**
     * Updates a table from a MEDLINE XML file or directory using the active table schema.
     *
     * @param fileStr   path to the file or directory containing new or updated entries
     * @param tableName table to update
     * @see #updateFromXML(String, String, String)
     */
    public void updateFromXML(String fileStr, String tableName) {
        updateFromXML(fileStr, tableName, activeTableSchema);
    }

    /**
     * Updates an existing database. If the file contains new entries those are
     * inserted, otherwise the table is updated to the version in the file.
     *
     * @param fileStr    - file containing new or updated entries
     * @param tableName  - table to update
     * @param schemaName the table schema to use for the update
     */
    public void updateFromXML(String fileStr, String tableName, String schemaName) {
        FieldConfig fieldConfig = fieldConfigs.get(schemaName);

        // TODO deprecated way of determining the primary key fields?! Make sure
        // and use appropriate method of FieldConfig.
        // NOTE(review): 'pks' is computed here but never used afterwards; also,
        // 'field.get("primaryKey").equals(true)' compares against a Boolean and will
        // only ever match if the field map really stores Boolean values - verify.
        List<String> pks = new ArrayList<>();
        List<Map<String, String>> fields = fieldConfig.getFields();
        for (Map<String, String> field : fields)
            if (field.containsKey("primaryKey"))
                if (field.get("primaryKey").equals(true))
                    pks.add(field.get("name"));
        LOG.info("Starting update...");

        // A single file is updated directly; for a directory, all archive/XML
        // members are collected.
        String[] fileNames;
        File fileOrDir = new File(fileStr);
        if (!fileOrDir.isDirectory()) {
            fileNames = new String[1];
            fileNames[0] = fileStr;
        } else {
            fileNames = fileOrDir.list(new FilenameFilter() {
                public boolean accept(File arg0, String arg1) {
                    // TODO write accepted file extensions in configuration
                    // file
                    return arg1.endsWith(".zip") || arg1.endsWith(".gz") || arg1.endsWith(".xml");
                }
            });
        }

        // in medline, the files are ordered chronological
        Arrays.sort(fileNames);
        XMLPreparer xp = new XMLPreparer(fileOrDir, fieldConfig);
        for (String fileName : fileNames) {
            LOG.info("Updating from " + fileName);
            Iterator<Map<String, Object>> fileIt = xp.prepare(fileName);
            updateFromRowIterator(fileIt, tableName, true, schemaName);
        }
    }

    /**
     * Imports rows into a table, committing in batches and using the active table schema.
     *
     * @param it        an Iterator, yielding rows to insert into the database
     * @param tableName name of the table to import into
     * @see #importFromRowIterator(Iterator, String, boolean, String)
     */
    public void importFromRowIterator(Iterator<Map<String, Object>> it, String tableName) {
        importFromRowIterator(it, tableName, true, activeTableSchema);
    }

    /**
     * Imports rows into a table, committing in batches.
     *
     * @param it          an Iterator, yielding rows to insert into the database
     * @param tableName   name of the table to import into
     * @param tableSchema the table schema corresponding to the data table
     * @see #importFromRowIterator(Iterator, String, boolean, String)
     */
    public void importFromRowIterator(Iterator<Map<String, Object>> it, String tableName, String tableSchema) {
        importFromRowIterator(it, tableName, true, tableSchema);
    }

    /**
     * Internal method to import into an existing table.
     *
     * @param it         - an Iterator, yielding rows to insert into the database
     * @param tableName  - the updated table
     * @param commit     - if true, the inserted data will be committed in batches
     *                   within this method; no commits will happen otherwise.
     * @param schemaName the name of the table schema corresponding to the data table
     */
    public void importFromRowIterator(Iterator<Map<String, Object>> it, String tableName,
                                      boolean commit, String schemaName) {
        // Fast return to spare some unnecessary communication with the
        // database.
        if (!it.hasNext())
            return;

        FieldConfig fieldConfig = fieldConfigs.get(schemaName);

        String dataImportStmtString = constructImportStatementString(tableName, fieldConfig);
        String mirrorUpdateStmtString = constructMirrorInsertStatementString(fieldConfig);

        boolean wasAutoCommit = true;

        try (CoStoSysConnection conn = obtainOrReserveConnection()) {
            wasAutoCommit = conn.getAutoCommit();
            // Get the list of mirror subsets in which all new primary keys must
            // be inserted as well.
            LinkedHashMap<String, Boolean> mirrorNames = getMirrorSubsetNames(conn, tableName);

            conn.setAutoCommit(false);
            PreparedStatement psDataImport = conn.prepareStatement(dataImportStmtString);

            // One prepared insert statement per mirror subset; the mirrors receive
            // only the primary key values of each imported row.
            List<PreparedStatement> mirrorStatements = null;
            if (mirrorNames != null) {
                mirrorStatements = new ArrayList<>();
                for (String mirror : mirrorNames.keySet()) {
                    mirrorStatements.add(conn.prepareStatement(String.format(mirrorUpdateStmtString, mirror)));
                }
            }
            List<Map<String, String>> fields = fieldConfig.getFields();
            int i = 0;
            while (it.hasNext()) {
                Map<String, Object> row = it.next();
                for (int j = 0; j < fields.size(); j++) {
                    Map<String, String> field = fields.get(j);
                    String fieldName = field.get(JulieXMLConstants.NAME);
                    setPreparedStatementParameterWithType(j + 1, psDataImport, row.get(fieldName), fieldName,
                            fieldConfig);
                }
                psDataImport.addBatch();

                if (mirrorStatements != null) {
                    for (PreparedStatement ps : mirrorStatements) {
                        for (int j = 0; j < fieldConfig.getPrimaryKey().length; j++) {
                            String fieldName = fieldConfig.getPrimaryKey()[j];
                            setPreparedStatementParameterWithType(j + 1, ps, row.get(fieldName), fieldName,
                                    fieldConfig);
                        }
                        ps.addBatch();
                    }
                }

                ++i;
                if (i >= commitBatchSize) {
                    psDataImport.executeBatch();
                    if (mirrorStatements != null)
                        for (PreparedStatement ps : mirrorStatements)
                            ps.executeBatch();
                    // NOTE If a fast return from a commit is required, rather
                    // use
                    // Postgres asynchroneous commit
                    // (http://www.postgresql.org/docs/9.1/static/wal-async-commit.html)
                    // commit(conn);
                    if (commit)
                        conn.commit();
                    psDataImport = conn.prepareStatement(dataImportStmtString);
                    i = 0;
                }
            }
            // Flush the final, partially filled batch.
            if (i > 0) {
                psDataImport.executeBatch();
                if (commit)
                    conn.commit();
                if (mirrorStatements != null)
                    for (PreparedStatement ps : mirrorStatements)
                        ps.executeBatch();
                // NOTE If a fast return from a commit is required, rather
                // use
                // Postgres asynchroneous commit
                // (http://www.postgresql.org/docs/9.1/static/wal-async-commit.html)
                // commit(conn);
                if (commit)
                    conn.commit();
                conn.setAutoCommit(wasAutoCommit);
            }
        } catch (SQLException e) {
            LOG.error("SQLException while trying to insert: ", e);
            SQLException nextException = e.getNextException();
            if (nextException != null) {
                LOG.error("Next exception: ", nextException);
            }
            throw new CoStoSysSQLRuntimeException(e);
        } finally {
            try {
                // Wait for a possibly running asynchronous commit thread before returning.
                if (commitThread != null)
                    commitThread.join();
            } catch (InterruptedException e) {
                throw new CoStoSysRuntimeException(e);
            }
        }
    }

    /**

+ * Updates a table with the entries yielded by the iterator. If the entries is + * not yet in the table, it will be inserted instead. + *

+ *

+ * The input rows are expected to fit the active table schema. + *

+ * + * @param it - an Iterator, yielding new or updated entries. + * @param tableName - the updated table + */ + public void updateFromRowIterator(Iterator> it, String tableName) { + updateFromRowIterator(it, tableName, true, activeTableSchema); + } + + /** + *

+ * Updates a table with the entries yielded by the iterator. If the entries is + * not yet in the table, it will be inserted instead. + *

+ *

+ * The input rows are expected to fit the table schema schemaName. + *

+ * + * @param it + * - an Iterator, yielding new or updated entries. + * @param tableName + * - the updated table + */ + + /** + *

+ * Updates a table with the entries yielded by the iterator. If the entries is + * not yet in the table, it will be inserted instead. + *

+ *

+ * The input rows are expected to fit the table schema schemaName. + * + * @param it - an Iterator, yielding new or updated entries. + * @param tableName - the updated table + * @param commit - if true, the updated data will be committed in batches + * within this method; nothing will be commit otherwise. + * @param schemaName the name of the table schema corresponding to the updated data + * table + */ + public void updateFromRowIterator(Iterator> it, String tableName, + boolean commit, String schemaName) { + // Fast return to avoid unnecessary communication with the database. + if (!it.hasNext()) + return; + + FieldConfig fieldConfig = fieldConfigs.get(schemaName); + + String statementString = constructUpdateStatementString(tableName, fieldConfig); + String mirrorInsertStmtString = constructMirrorInsertStatementString(fieldConfig); + + // this is just a default value in case the next line throws an exception + boolean wasAutoCommit = true; + try (CoStoSysConnection conn = obtainOrReserveConnection()) { + wasAutoCommit = conn.getAutoCommit(); + LOG.trace("Retrieving mirror subsets of table {}", tableName); + LinkedHashMap mirrorNames = getMirrorSubsetNames(conn, tableName); + + List mirrorStatements = null; + if (mirrorNames != null) { + mirrorStatements = new ArrayList<>(); + for (String mirror : mirrorNames.keySet()) { + mirrorStatements.add(conn.prepareStatement(String.format(mirrorInsertStmtString, mirror))); + } + } + + final int sliceSize = 10000; + LOG.trace("Reading update data slice up to {} documents. Within this slice, duplicate document IDs will be handled by only taking the last document into account.", sliceSize); + + String[] primaryKey = fieldConfig.getPrimaryKey(); + // This is an outer loop to help us cut the documents we get from the iterator in slices. This is very + // useful or even required when reading large archives from a single iterator. 
+ while (it.hasNext()) { + // This map will assemble for each primary key only the NEWEST (in + // XML the latest in Medline) row. Its size is an approximation of + // Medline blob XML files. + // TODO we should actually check for the PMID version and take the highest + Map> rowsByPk = new HashMap<>(); + while (it.hasNext() && rowsByPk.size() < sliceSize) { + Map row = it.next(); + StringBuilder rowPrimaryKey = new StringBuilder(); + for (int j = 0; j < primaryKey.length; j++) { + String keyFieldName = primaryKey[j]; + Object key = row.get(keyFieldName); + rowPrimaryKey.append(key); + + } + String pk = rowPrimaryKey.toString(); + rowsByPk.put(pk, row); + } + + PreparedStatement ps = conn.prepareStatement(statementString); + List> fields = fieldConfig.getFields(); + List> cache = new ArrayList<>(commitBatchSize); + int i = 0; + for (Map row : rowsByPk.values()) { + + for (int j = 0; j < fields.size() + primaryKey.length; j++) { + if (j < fields.size()) { + Map field = fields.get(j); + String fieldName = field.get(JulieXMLConstants.NAME); + setPreparedStatementParameterWithType(j + 1, ps, row.get(fieldName), null, null); + } else { + String key = primaryKey[j - fields.size()]; + Object keyValue = row.get(key); + setPreparedStatementParameterWithType(j + 1, ps, keyValue, null, null); + } + } + ps.addBatch(); + cache.add(row); + + ++i; + if (i >= commitBatchSize) { + LOG.trace("Committing batch of size {}", i); + executeAndCommitUpdate(tableName, conn, commit, schemaName, fieldConfig, mirrorNames, + mirrorStatements, ps, cache); + cache.clear(); + i = 0; + } + } + if (i > 0) { + LOG.trace("Committing last batch of size {}", i); + executeAndCommitUpdate(tableName, conn, commit, schemaName, fieldConfig, mirrorNames, + mirrorStatements, ps, cache); + } + conn.setAutoCommit(wasAutoCommit); + } + } catch (SQLException e) { + LOG.error( + "SQL error while updating table {}. Database configuration is: {}. 
Table schema configuration is: {}", + tableName, dbConfig, fieldConfig, e); + SQLException nextException = e.getNextException(); + if (null != nextException) { + LOG.error("Next exception was: ", nextException); + } + throw new CoStoSysSQLRuntimeException(e); + } finally { + try { + if (commitThread != null) + commitThread.join(); + + } catch (InterruptedException e) { + throw new CoStoSysRuntimeException(e); + } + } + } + + /** + * Performs the actual update in the database. Additionally manages the + * appropriate reset of rows in mirror subsets and the addition of missing rows + * in mirror subsets. + * + * @param tableName + * @param externalConn + * @param commit + * @param schemaName + * @param fieldConfig + * @param mirrorNames + * @param mirrorStatements + * @param ps + * @param cache + * @throws SQLException + */ + private void executeAndCommitUpdate(String tableName, CoStoSysConnection externalConn, boolean commit, String schemaName, + FieldConfig fieldConfig, LinkedHashMap mirrorNames, + List mirrorStatements, PreparedStatement ps, List> cache) + throws SQLException { + boolean wasAutoCommit = externalConn.getAutoCommit(); + try { + externalConn.setAutoCommit(false); + int[] returned = ps.executeBatch(); + + List> toInsert = new ArrayList<>(commitBatchSize); + List> toResetRows = new ArrayList<>(commitBatchSize); + List toResetPKs = new ArrayList<>(); + + fillUpdateLists(cache, returned, toInsert, toResetPKs, toResetRows, fieldConfig); + importFromRowIterator(toInsert.iterator(), tableName, commit, schemaName); + // Do a commit to end the transaction. This is sometimes even necessary + // because following transactions would be blocked otherwise. 
+ LOG.trace("Committing updates to the data table."); + externalConn.commit(); + if (mirrorNames != null) { + LOG.trace("Applying updates to mirror subsets:"); + List> toInsertMirror = new ArrayList<>(commitBatchSize); + Iterator mirrorNamesIt = mirrorNames.keySet().iterator(); + Iterator mirrorStatementsIt = mirrorStatements.iterator(); + for (int j = 0; j < mirrorNames.size(); j++) { + String mirrorName = mirrorNamesIt.next(); + LOG.trace("Applying to mirror subset \"{}\"", mirrorName); + // The mirrorNames hashmap has as values booleans telling + // whether to reset a mirror table or not. If not, we still want + // to know whether there are any missing rows and insert them. + if (mirrorNames.get(mirrorName)) { + LOG.trace("Resetting updated rows."); + returned = resetSubset(externalConn, mirrorName, toResetPKs, schemaName); + } else { + LOG.trace("Updates rows are NOT reset."); + returned = determineExistingSubsetRows(externalConn, mirrorName, toResetPKs, schemaName); + } + // Possibly some update documents don't even exist + // in a mirror subset. This shouldn't happen of + // course, but it might due to errors. This allows + // to repair the error by an update instead of + // deleting the missing data from the data table and + // re-import it. + fillUpdateLists(toResetRows, returned, toInsertMirror, null, null, fieldConfig); + if (toInsertMirror.size() > 0) { + LOG.trace("{} updated rows where not found in this mirror subset. They will be added"); + // The mirror insert statements are a parallel list + // to mirrorNames, thus the jth mirrorName belong to + // the jth insert statement. 
+ PreparedStatement mirrorPS = mirrorStatementsIt.next(); + for (Map missingMirrorRow : toInsertMirror) { + for (int k = 0; k < fieldConfig.getPrimaryKey().length; k++) { + String fieldName = fieldConfig.getPrimaryKey()[k]; + setPreparedStatementParameterWithType(k + 1, mirrorPS, missingMirrorRow.get(fieldName), + fieldName, fieldConfig); + } + mirrorPS.addBatch(); + } + mirrorPS.executeBatch(); + toInsertMirror.clear(); + } else { + LOG.trace("All updated rows exist in the mirror subset."); + } + } + } + + if (commit) { + LOG.trace("Committing updates."); + externalConn.commit(); + } + } finally { + externalConn.setAutoCommit(wasAutoCommit); + } + } + + /** + *

+ * Prepares lists of documents to insert into a table and primary keys for which + * mirror subsets must be reseted because the respective documents in the data + * table have been updated. The preparation happens basing on the return value + * of an SQL operation trying to operate on a set of documents, e.g. updating + * them. A batch UPDATE command, for instance, returns an int[] where for each + * batch item 0 indicates non-success (could not be updated, presumably because + * the primary key in the update command does not exist) and 1 indicates + * success.
+ * Successful updated documents must be reseted in the mirror subsets, documents + * that could not be updated (and thus don't exist) must be inserted. + *

+ * + * @param cache Input: The list of rows for which the original SQL command was + * issued that returned the values in returned. Must be + * parallel to returned. + * @param returned Input: The return values of the SQL command issued on base of the + * rows contained in cache. + * @param toInsert Output: Rows from cache filtered by "corresponding value + * in returned was <= 0 (non-success)". + * @param toResetPKs Output: Primary keys from cache rows for which + * returned holds a value >0 (e.g. successful update). + * @param toResetRows Output, may be null: The rows from cache for which + * returned holds a value >0. + * @param fieldConfig Input: Field configuration to determine the correct primary key. + */ + private void fillUpdateLists(List> cache, int[] returned, List> toInsert, + List toResetPKs, List> toResetRows, FieldConfig fieldConfig) { + for (int j = 0; j < returned.length; ++j) { + Map newRow = cache.get(j); + if (returned[j] <= 0) { + toInsert.add(newRow); + } else { + if (null != toResetPKs) { + Object[] pkValues = new Object[fieldConfig.getPrimaryKey().length]; + for (int k = 0; k < pkValues.length; k++) { + String pkColumn = fieldConfig.getPrimaryKey()[k]; + pkValues[k] = newRow.get(pkColumn); + } + toResetPKs.add(pkValues); + } + if (null != toResetRows) + toResetRows.add(newRow); + } + } + } + + /** + * Creates an SQL-template, usable in prepared statements which add new values + * into a table + * + * @param fieldConfig - used to get the primary key, as the template must contain it + * @return - an SQL string for inserting, containing a '?' 
for every primary key + * and a %s for the table name + */ + private String constructMirrorInsertStatementString(FieldConfig fieldConfig) { + String stmtTemplate = "INSERT INTO %s (%s) VALUES (%s)"; + String pkStr = fieldConfig.getPrimaryKeyString(); + String[] wildCards = new String[fieldConfig.getPrimaryKey().length]; + for (int i = 0; i < wildCards.length; i++) + wildCards[i] = "?"; + String wildCardStr = StringUtils.join(wildCards, ","); + return String.format(stmtTemplate, "%s", pkStr, wildCardStr); + } + + /** + * Constructs an SQL prepared statement for import of data rows into the + * database table tableName according to the field schema + * definition. + * + * Example: + *

+ * If the field schema contains two rows 'pmid' and 'xml', the statement + * expressions expects all these rows to be filled. The resulting String will be + * + *

INSERT INTO (pmid,xml) VALUES (?,?)
+ * + * @param tableName Name of the database table to import data into. + * @param fieldDefinition A {@link FieldConfig} object determining the rows to be imported. + * @return An SQL prepared statement string for import of data into the table. + */ + private String constructImportStatementString(String tableName, FieldConfig fieldDefinition) { + String stmtTemplate = "INSERT INTO %s (%s) VALUES (%s)"; + List> fields = fieldDefinition.getFields(); + StringBuilder columnsStrBuilder = new StringBuilder(); + StringBuilder valuesStrBuilder = new StringBuilder(); + for (int i = 0; i < fields.size(); ++i) { + columnsStrBuilder.append(fields.get(i).get(JulieXMLConstants.NAME)); + if (fields.get(i).get(JulieXMLConstants.TYPE).equals("xml")) + valuesStrBuilder.append("XMLPARSE(CONTENT ?)"); + else + valuesStrBuilder.append("?"); + if (i < fields.size() - 1) { + columnsStrBuilder.append(","); + valuesStrBuilder.append(","); + } + } + return String.format(stmtTemplate, tableName, columnsStrBuilder.toString(), valuesStrBuilder.toString()); + } + + + /** + * Constructs an SQL prepared statement for updating data rows in the database + * table tableName according to the field schema definition. + * + * Example: + *

+ * If the field schema contains two rows ('pmid' and 'xml') and pmid is primary + * key, the resulting String will be + * + *

UPDATE SET pmid=?, xml=? WHERE pmid=?
+ * + * @param tableName Name of the database table to import data into. + * @param fieldDefinition A {@link FieldConfig} object determining the rows to be imported. + * @return An SQL prepared statement string for import of data into the table. + */ + private String constructUpdateStatementString(String tableName, FieldConfig fieldDefinition) { + String stmtTemplate = "UPDATE %s SET %s WHERE %s"; + List> fields = fieldDefinition.getFields(); + StringBuilder newValueStrBuilder = new StringBuilder(); + for (int i = 0; i < fields.size(); ++i) { + newValueStrBuilder.append(fields.get(i).get(JulieXMLConstants.NAME)); + if (fields.get(i).get(JulieXMLConstants.TYPE).equals("xml")) + newValueStrBuilder.append("=XMLPARSE(CONTENT ?)"); + else + newValueStrBuilder.append("=?"); + if (i < fields.size() - 1) + newValueStrBuilder.append(","); + } + String[] primaryKeys = fieldDefinition.getPrimaryKey(); + StringBuilder conditionStrBuilder = new StringBuilder(); + for (int i = 0; i < primaryKeys.length; ++i) { + String key = primaryKeys[i]; + conditionStrBuilder.append(key).append("=?"); + if (i < primaryKeys.length - 1) + conditionStrBuilder.append(" AND "); + } + String statementString = String.format(stmtTemplate, tableName, newValueStrBuilder.toString(), + conditionStrBuilder.toString()); + LOG.trace("PreparedStatement update command: {}", statementString); + return statementString; + } + + /** + * Alters an table, executing the supplied action + * + * @param action - SQL fragment, specifiying how to alter the table + * @param tableName - table to alter + */ + private void alterTable(String action, String tableName) { + + String sqlString = "ALTER TABLE " + tableName + " " + action; + try (CoStoSysConnection conn = obtainOrReserveConnection()) { + Statement st = conn.createStatement(); + st.execute(sqlString); + } catch (SQLException e) { + e.printStackTrace(); + } + } + + /** + * @param ids + * @param table + * @param timestamp + * @return + * @see #queryWithTime(List, 
String, String, String) + */ + public DBCIterator queryWithTime(List ids, String table, String timestamp) { + return queryWithTime(ids, table, timestamp, activeTableSchema); + } + + /******************************** + * Data Retrieval + ****************************************************************************************************/ + /* + * Speed: (tested by repeated queries, using a pool-pc and 1000 as batchSize) + * queryAll() fetched 8.5 documents/ms (33min for whole db with 16.9*10e6 + * documents) query(ids) fetched 9.3 documents/ms (9.3sec for 10e5 documents of + * a PMID sample) + */ + + /** + * Returns an iterator over all rows in the table with matching id and a + * timestamp newer (>) than timestamp. The Iterator will use + * threads, memory and a connection until all matches are returned. + * + * @param ids - List with primary keys + * @param table - table to query + * @param timestamp - timestamp (only rows with newer timestamp are returned) + * @return - pmid and xml as an Iterator + */ + public DBCIterator queryWithTime(List ids, String table, String timestamp, String schemaName) { + FieldConfig fieldConfig = fieldConfigs.get(schemaName); + String timestampWhere = fieldConfig.getTimestampFieldName() + " > " + timestamp; + return new ThreadedColumnsToRetrieveIterator(this, ids, table, timestampWhere, schemaName); + } + + /** + * Returns an iterator over the column field in the table + * table. NOTE: The Iterator will use threads, memory and a + * connection until the iterator is empty, i.e. hasNext() returns + * null! + * + * @param fields - field to return + * @param table - table to query + * @return - results as an Iterator + */ + public DBCIterator queryAll(List fields, String table) { + return new ThreadedColumnsIterator(this, fields, table); + } + + /** + * Returns the requested fields from the requested table. 
The iterator must be fully consumed or dangling threads + * and connections will remain, possible causing the application to wait forever for an open connection. + * + * @param table The table to query. + * @param fields The names of the columns to retrieve values from. + * @return An iterator over the requested columns values. + */ + public DBCIterator query(String table, List fields) { + return new ThreadedColumnsIterator(this, fields, table); + } + + /** + * Returns the requested fields from the requested table. The iterator must be fully consumed or dangling threads + * and connections will remain, possible causing the application to wait forever for an open connection. + * + * @param table The table to query. + * @param fields The names of the columns to retrieve values from. + * @param limit A limit of documents to retrieve. + * @return An iterator over the requested columns values. + */ + public DBCIterator query(String table, List fields, long limit) { + return new ThreadedColumnsIterator(this, fields, table, limit); + } + + /** + * Returns the values the the column {@link #DEFAULT_FIELD} in the given table. + * The Iterator will use threads, memory and a connection until all matches were + * returned. + * + * @param keys + * @param table + * @return + * @see #query(List, String, String) + */ + public DBCIterator query(List keys, String table) { + return new ThreadedColumnsIterator(this, keys, Collections.singletonList(DEFAULT_FIELD), table, activeTableSchema); + } + + /** + * Returns the values the the column {@link #DEFAULT_FIELD} in the given table. The + * Iterator will use threads, memory and a connection until all matches were + * returned. 
+ * + * @param keys - list of String[] containing the parts of the primary key + * @param table - table to query + * @return - results as an Iterator + */ + public DBCIterator query(List keys, String table, String schemaName) { + return new ThreadedColumnsIterator(this, keys, Collections.singletonList(DEFAULT_FIELD), table, schemaName); + } + + /** + * Retrieves row values of table from the database. The returned columns are those + * that are configuration to be retrieved in the active table schema. + * + * @param ids + * @param table + * @return + * @see #retrieveColumnsByTableSchema(List, String, String) + */ + public DBCIterator retrieveColumnsByTableSchema(List ids, String table) { + return retrieveColumnsByTableSchema(ids, table, activeTableSchema); + } + + /** + * Retrieves row values of table from the database. The returned columns are those + * that are configuration to be retrieved in the table schema with name schemaName. + * + * @param ids + * @param table + * @param schemaName + * @return + */ + public DBCIterator retrieveColumnsByTableSchema(List ids, String table, String schemaName) { + return new ThreadedColumnsToRetrieveIterator(this, ids, table, schemaName); + } + + /** + * Retrieves data from the database over multiple tables. All tables will be joined on the given IDs. + * The columns to be retrieved for each table is determined by its table schema. For this purpose, the + * tables and schemaName arrays are required to be parallel. + * + * @param ids A list of primary keys identifying the items to retrieve. + * @param tables The tables from which the items should be retrieved that are identified by ids. + * @param schemaNames A parallel array to tables thas specifies the table schema name of each table. + * @return The joined data from the requested tables. 
+ */ + public DBCIterator retrieveColumnsByTableSchema(List ids, String[] tables, String[] schemaNames) { + return new ThreadedColumnsToRetrieveIterator(this, ids, tables, schemaNames); + } + + /** + *

+ * Returns all column data from the data table tableName which is + * marked as 'to be retrieved' in the table scheme specified by the active table + * scheme. + *

+ *

+ * For more specific information, please refer to + * {@link #queryDataTable(String, String, String)}. + *

+ * + * @param tableName Name of a data table. + * @param whereCondition Optional additional specifications for the SQL "SELECT" statement. + * @see #queryDataTable(String, String, String) + */ + public DBCIterator queryDataTable(String tableName, String whereCondition) { + return queryDataTable(tableName, whereCondition, activeTableSchema); + } + + /** + *

+ * Returns all column data from the data table tableName which is + * marked as 'to be retrieved' in the table scheme specified by + * schemaName. + *

+ *

+ * This method offers direct access to the table data by using an SQL + * ResultSet in cursor mode, allowing for queries leading to large + * results. + *

+ *

+ * An optional where clause (actually everything behind the "FROM" in the SQL + * select statement) may be passed to restrict the columns being returned. All + * specifications are allowed which do not alter the number of columns returned + * (like "GROUP BY"). + *

+ * + * @param tableName Name of a data table. + * @param whereCondition Optional additional specifications for the SQL "SELECT" statement. + * @param schemaName The table schema name to determine which columns should be + * retrieved. // * @return An iterator over byte[][] . + * Each returned byte array contains one nested byte array for each + * retrieved column, holding the column's data in a sequence of + * bytes. + */ + public DBCIterator queryDataTable(String tableName, String whereCondition, String schemaName) { + if (!withConnectionQueryBoolean(c -> c.tableExists(tableName))) + throw new IllegalArgumentException("Table \"" + tableName + "\" does not exist."); + + final FieldConfig fieldConfig = fieldConfigs.get(schemaName); + + // Build the correct query. + String query = null; + String selectedColumns = StringUtils.join(fieldConfig.getColumnsToRetrieve(), ","); + // prepend there WHERE keyword if not already present and if we don't + // actually have only a LIMIT constraint + if (whereCondition != null && !whereCondition.trim().toUpperCase().startsWith("WHERE") + && !whereCondition.trim().toUpperCase().matches("LIMIT +[0-9]+")) + query = String.format("SELECT %s FROM %s WHERE %s", selectedColumns, tableName, whereCondition); + else if (whereCondition != null) + query = String.format("SELECT %s FROM %s %s", selectedColumns, tableName, whereCondition); + else + query = String.format("SELECT %s FROM %s", selectedColumns, tableName); + final String finalQuery = query; + + try { + + DBCIterator it = new DBCIterator() { + + private CoStoSysConnection conn = reserveConnection(); + private ResultSet rs = doQuery(conn); + private boolean hasNext = rs.next(); + + private ResultSet doQuery(CoStoSysConnection conn) throws SQLException { + // Get a statement which is set to cursor mode. 
The data + // table could + // be really large and we don't have the two fold process + // here where + // first we get IDs from a subset and then only the actual + // documents + // for these IDs. + conn.setAutoCommit(false); + Statement stmt = conn.createStatement(); + stmt.setFetchSize(queryBatchSize); + return stmt.executeQuery(finalQuery); + } + + @Override + public boolean hasNext() { + if (!hasNext) + close(); + return hasNext; + } + + @Override + public byte[][] next() { + if (hasNext) { + List> fields = fieldConfig.getFields(); + try { + byte[][] retrievedData = new byte[fieldConfig.getColumnsToRetrieve().length][]; + for (int i = 0; i < retrievedData.length; i++) { + retrievedData[i] = rs.getBytes(i + 1); + if (Boolean.parseBoolean(fields.get(i).get(JulieXMLConstants.GZIP))) + retrievedData[i] = JulieXMLTools.unGzipData(retrievedData[i]); + } + hasNext = rs.next(); + if (!hasNext) + close(); + return retrievedData; + } catch (SQLException | IOException e) { + hasNext = false; + e.printStackTrace(); + } + } + return null; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + + @Override + public void close() { + conn.close(); + } + }; + + return it; + } catch (SQLException e) { + LOG.error("Error while executing SQL statement \"" + finalQuery + "\""); + e.printStackTrace(); + } + + return null; + } + + /** + * @param tableName + * @param limitParam + * @return + * @throws SQLException + */ + public DBCIterator querySubset(String tableName, long limitParam) throws SQLException { + return querySubset(tableName, null, limitParam, 0, activeTableSchema); + } + + public int getQueryBatchSize() { + return queryBatchSize; + } + + public void setQueryBatchSize(int queryBatchSize) { + this.queryBatchSize = queryBatchSize; + } + + /** + *

+ * Retrieves XML field values in the data table referenced by the subset table + * tableName or tableName itself if it is a data + * table. + *

+ *

+ * The method always first retrieves a batch of primary keys from the subset + * table and then gets the actual documents from the data table (necessary for + * the data table - subset paradigm). As this is unnecessary when querying + * directly from a data table, for that kind of queries this method calls + * {@link #queryDataTable(String, String, String)}. + *

+ *

+ * The number of returned documents is restricted in number by + * limitParam. All documents are returned if + * limitParam is of negative value.
+ * Note: Of course, whereClause could already contain an SQL + * 'LIMIT' specification. However, I won't work as expected since this limit + * expression would be applied to each batch of subset-IDs which is used to + * query the data table. Using the limitParam parameter will assure + * you get at most as much documents from the iterator as specified. If + * tableName denotes a data table and whereClause does + * not already contain a 'LIMIT' expression, limitParam will be + * added to whereClause for the subsequent call to + * queryDataTable. + *

+ * + * @param tableName Subset table determining which documents to retrieve from the data + * table; may also be a data table itself. + * @param whereClause An SQL where clause restricting the returned columns of each + * queried subset-ID batch. This clause must not change the rows + * returned (e.g. by 'GROUP BY'). + * @param limitParam Number restriction of documents to return. + * @param numberRefHops + * @param schemaName The name of table schema of the referenced data table. + * @return An iterator returning documents references from or in the table + * tableName. + * @throws SQLException + * @see #queryDataTable(String, String, String) + */ + public DBCIterator querySubset(final String tableName, final String whereClause, final long limitParam, + Integer numberRefHops, final String schemaName) throws SQLException { + if (!withConnectionQueryBoolean(c -> c.tableExists(tableName))) + throw new IllegalArgumentException("Table \"" + tableName + "\" does not exist."); + + final FieldConfig fieldConfig = fieldConfigs.get(schemaName); + final String dataTable = withConnectionQueryString(c -> c.getReferencedTable(tableName, numberRefHops)); + if (dataTable.equals(tableName)) { + String newWhereClause = whereClause; + if (newWhereClause == null && limitParam > 0) + newWhereClause = ""; + // For the current method, limit must be given explicitly. Not so + // for querying a single table like the data table. If the + // whereClause not already contains a LIMIT expression, we just add + // it corresponding to the limit parameter. + if (limitParam > 0 && !newWhereClause.toLowerCase().matches(".*limit +[0-9]+.*")) + newWhereClause += " LIMIT " + limitParam; + return queryDataTable(tableName, newWhereClause, schemaName); + } + + + try (final CoStoSysConnection conn = obtainOrReserveConnection()) { + // We will set the key-retrieval-statement below to cursor mode by + // specifying a maximum number of rows to return; for this to work, + // auto commit must be turned off. 
+ conn.setAutoCommit(false); + final Statement stmt = conn.createStatement(); + // Go to cursor mode by setting a fetch size. + stmt.setFetchSize(queryBatchSize); + // As we want to query the whole subset/data table, just get a + // cursor over all IDs in the set. + String sql = "SELECT (" + fieldConfig.getPrimaryKeyString() + ") FROM " + tableName; + final ResultSet outerKeyRS = stmt + .executeQuery(sql); + final DataBaseConnector dbc = this; + + // We need to keep the connection open until the iterator has finished. It will close the connection + // when all items have been returned, effectively decreasing the usage level of the CoStoSysConnection. + conn.incrementUsageNumber(); + DBCIterator it = new DBCIterator() { + + private long returnedDocs = 0; + private ResultSet keyRS = outerKeyRS; + private long limit = limitParam <= 0 ? Long.MAX_VALUE : limitParam; + private Iterator xmlIt; + + @Override + public boolean hasNext() { + if (returnedDocs >= limit) + return false; + + try { + if (xmlIt == null || !xmlIt.hasNext()) { + int currentBatchSize = 0; + List ids = new ArrayList(); + String[] pks = fieldConfig.getPrimaryKey(); + while (currentBatchSize < queryBatchSize && keyRS.next()) { + String[] values = new String[pks.length]; + for (int i = 0; i < pks.length; i++) { + values[i] = (String) keyRS.getObject(i + 1); + } + ids.add(values); + ++currentBatchSize; + } + if (whereClause != null) + xmlIt = new ThreadedColumnsToRetrieveIterator(dbc, conn, ids, dataTable, whereClause, schemaName); + else + xmlIt = new ThreadedColumnsToRetrieveIterator(dbc, conn, ids, dataTable, schemaName); + + boolean xmlItHasNext = xmlIt.hasNext(); + if (!xmlItHasNext) + close(); + + return xmlItHasNext; + } + } catch (SQLException e) { + e.printStackTrace(); + } + return true; + } + + @Override + public byte[][] next() { + if (!hasNext()) { + close(); + return null; + } + ++returnedDocs; + return xmlIt.next(); + } + + @Override + public void remove() { + throw new 
UnsupportedOperationException(); + } + + @Override + public void close() { + conn.close(); + } + + }; + + return it; + } catch (SQLException e) { + e.printStackTrace(); + } + return null; + } + + /** + * Helper method to determine the columns that are returned in case of a joining operation. Returns the number of + * returned fields and the according field definitions. If joined is set to false, only the + * first table and the first schema is taken into account. + * + * @param joined Whether the data is joined. + * @param schemaNames The names of the table schemas of the tables that are read. From the respective table schemas, + * the columns that are marked to be retrieved, are extracted. + * @return A pair holding the number of retrieved columns and those columns themselves. + */ + public Pair>> getNumColumnsAndFields(boolean joined, String[] schemaNames) { + int numColumns = 0; + List> fields = new ArrayList<>(); + if (!joined) { + FieldConfig fieldConfig = fieldConfigs.get(schemaNames[0]); + numColumns = fieldConfig.getColumnsToRetrieve().length; + fields = fieldConfig.getFields(); + } else { + for (int i = 0; i < schemaNames.length; i++) { + FieldConfig fieldConfig = fieldConfigs.get(schemaNames[i]); + int num = fieldConfig.getColumnsToRetrieve().length; + numColumns = numColumns + num; + List> fieldsPartly = fieldConfig.getFieldsToRetrieve(); + fields.addAll(fieldsPartly); + } + } + return new ImmutablePair<>(numColumns, fields); + } + + /** + * Returns the row count of the requested table. + * + * @param tableName The table to count the rows of. + * @return The table row count. 
+ */ + public long getNumRows(String tableName) { + + try (CoStoSysConnection conn = obtainOrReserveConnection()) { + String sql = String.format("SELECT sum(1) as %s FROM %s", Constants.TOTAL, tableName); + ResultSet resultSet = conn.createStatement().executeQuery(sql); + if (resultSet.next()) { + return resultSet.getLong(Constants.TOTAL); + } + } catch (SQLException e) { + LOG.error("Error when trying to determine size of table {}: {}", tableName, e); + } + return 0; + } + + /** + * Returns a map with information about how many rows are marked as + * is_in_process, is_processed and how many rows there are in + * total.
+ * The respective values are stored under with the keys + * {@link Constants#IN_PROCESS}, {@link Constants#PROCESSED} and + * {@link Constants#TOTAL}. + * + * @param subsetTableName name of the subset table to gain status information for + * @return A SubsetStatus instance containing status information about the + * subset table subsetTableName + * @throws TableNotFoundException If subsetTableName does not point to a database table. + */ + public SubsetStatus status(String subsetTableName, Set statusElementsToReturn) throws TableNotFoundException { + if (!tableExists(subsetTableName)) + throw new TableNotFoundException("The subset table \"" + subsetTableName + "\" does not exist."); + + SubsetStatus status = new SubsetStatus(); + + try (CoStoSysConnection conn = obtainOrReserveConnection()) { + StringJoiner joiner = new StringJoiner(","); + String sumFmtString = "sum(case when %s=TRUE then 1 end) as %s"; + if (statusElementsToReturn.contains(StatusElement.HAS_ERRORS)) + joiner.add(String.format(sumFmtString, Constants.HAS_ERRORS, Constants.HAS_ERRORS)); + if (statusElementsToReturn.contains(StatusElement.IS_PROCESSED)) + joiner.add(String.format(sumFmtString, Constants.IS_PROCESSED, Constants.IS_PROCESSED)); + if (statusElementsToReturn.contains(StatusElement.IN_PROCESS)) + joiner.add(String.format(sumFmtString, Constants.IN_PROCESS, Constants.IN_PROCESS)); + if (statusElementsToReturn.contains(StatusElement.TOTAL)) + joiner.add(String.format("sum(1) as %s", Constants.TOTAL)); + String sql = String.format( + "SELECT " + joiner.toString() + " FROM %s", subsetTableName); + Statement stmt = conn.createStatement(); + { + ResultSet res = stmt.executeQuery(sql); + if (res.next()) { + if (statusElementsToReturn.contains(StatusElement.HAS_ERRORS)) + status.hasErrors = res.getLong(Constants.HAS_ERRORS); + if (statusElementsToReturn.contains(StatusElement.IN_PROCESS)) + status.inProcess = res.getLong(Constants.IN_PROCESS); + if 
(statusElementsToReturn.contains(StatusElement.IS_PROCESSED)) + status.isProcessed = res.getLong(Constants.IS_PROCESSED); + if (statusElementsToReturn.contains(StatusElement.TOTAL)) + status.total = res.getLong(Constants.TOTAL); + } + } + + if (statusElementsToReturn.contains(StatusElement.LAST_COMPONENT)) { + SortedMap pipelineStates = new TreeMap<>(); + status.pipelineStates = pipelineStates; + String pipelineStateSql = String.format("SELECT %s,count(%s) from %s group by %s", + Constants.LAST_COMPONENT, Constants.LAST_COMPONENT, subsetTableName, Constants.LAST_COMPONENT); + ResultSet res = stmt.executeQuery(pipelineStateSql); + while (res.next()) + pipelineStates.put(res.getString(1) != null ? res.getString(1) : "", res.getLong(2)); + } + } catch (SQLException e) { + e.printStackTrace(); + } + + return status; + } + + /** + * @return - all tables in the active scheme + */ + public List getTables() { + ArrayList tables = new ArrayList(); + try (CoStoSysConnection conn = obtainOrReserveConnection()) { + ResultSet res = conn.getMetaData().getTables(null, dbConfig.getActivePGSchema(), null, + new String[]{"TABLE"}); + while (res.next()) + tables.add(res.getString("TABLE_NAME")); + } catch (SQLException e) { + e.printStackTrace(); + } + return tables; + } + + /** + * Query the MetaData for the columns of a table + * + * @param tableName - the table + * @return - List of String containing name and type of each column + */ + public List getTableDefinition(String tableName) { + try (CoStoSysConnection conn = obtainOrReserveConnection()) { + ArrayList columns = new ArrayList(); + String schema; + if (tableName.contains(".")) { + schema = tableName.split("\\.")[0]; + tableName = tableName.split("\\.")[1]; + } else + schema = dbConfig.getActivePGSchema(); + try { + ResultSet res = conn.getMetaData().getColumns(null, schema, tableName, null); + // ERIK 6th of December 2013: Removed the type information because + // it lead to false positives: When the + // dbcConfiguration 
specifies an "integer", it actually becomes an + // "int4". This could be treated, for the + // moment + // only the names will be checked. + while (res.next()) + // columns.add(res.getString("COLUMN_NAME") + " " + + // res.getString("TYPE_NAME")); + columns.add(res.getString("COLUMN_NAME")); + } catch (SQLException e) { + e.printStackTrace(); + } + return columns; + } + } + + /** + * @return - the active Postgres scheme + */ + public String getScheme() { + String scheme = "none"; + try (CoStoSysConnection conn = obtainOrReserveConnection()){ + ResultSet res = conn.createStatement().executeQuery("SHOW search_path;"); + if (res.next()) + scheme = res.getString(1); + } catch (SQLException e) { + e.printStackTrace(); + } + return scheme; + } + + /******************************* + * Classes for query() + *******************************************/ + + /** + * @return the active field configuration + */ + public FieldConfig getFieldConfiguration() { + return fieldConfigs.get(activeTableSchema); + } + + public void addFieldConfiguration(FieldConfig config) { + fieldConfigs.put(config.getName(), config); + } + + /** + * @param schemaName The name of the schema for which the eventual + * FieldConfig should be returned. + * @return The field configuration for schemaName. + */ + public FieldConfig getFieldConfiguration(String schemaName) { + return fieldConfigs.get(schemaName); + } + + /** + * Checks whether the given table matches the active table schema. + * + * @param tableName The table to check. + * @see #checkTableDefinition(String, String) + */ + public void checkTableDefinition(String tableName) throws TableSchemaMismatchException, TableNotFoundException { + checkTableDefinition(tableName, activeTableSchema); + } + + /** + * Compares the actual table in the database with its definition in the xml + * configuration
+ * Note: This method currently does not check other than primary key columns for + * tables that reference another table, even if those should actually be data + * tables. + *

+ * This method makes use of the {@link #obtainOrReserveConnection()} method to obtain a connection in case + * the current thread has not already obtained one. + *

+ * + * @param tableName - table to check + */ + public void checkTableDefinition(String tableName, String schemaName) throws TableSchemaMismatchException, TableNotFoundException { + try (CoStoSysConnection connection = obtainOrReserveConnection()) { + if (!tableExists(tableName)) + throw new TableNotFoundException("The table '" + tableName + "' does not exist."); + FieldConfig fieldConfig = fieldConfigs.get(schemaName); + + List actualColumns = new ArrayList<>(); + List definedColumns = new ArrayList<>(); + + // Postgres will convert table names to lower case but check for capital + // letter names all the same, thus never + // finding a match when giving names with capital letters. + tableName = tableName.toLowerCase(); + + // ERIK 6th of December 2013: Removed the type information because it + // lead to false positives: When the + // dbcConfiguration specifies an "integer", it actually becomes an + // "int4". This could be treated, for the moment + // only the names will be checked. 
+ String tableType; + if (getReferencedTable(tableName) == null) { // dataTable, check all + tableType = "data"; + // columns + actualColumns = new ArrayList<>(getTableDefinition(tableName)); + for (Map m : fieldConfig.getFields()) + // definedColumns.add(m.get("name") + " " + m.get("type")); + definedColumns.add(m.get(JulieXMLConstants.NAME)); + + } else { // subset table, check only pk-columns + tableType = "subset"; + for (Map m : fieldConfig.getFields()) + if (new Boolean(m.get(JulieXMLConstants.PRIMARY_KEY))) + // definedColumns.add(m.get("name") + " " + m.get("type")); + definedColumns.add(m.get("name")); + + // getting pk-names and types + String schema; + if (tableName.contains(".")) { + schema = tableName.split("\\.")[0]; + tableName = tableName.split("\\.")[1]; + } else + schema = dbConfig.getActivePGSchema(); + + HashSet pkNames = new HashSet(); + + Connection conn = connection.getConnection(); + try { + ResultSet res = conn.getMetaData().getImportedKeys("", schema, tableName); + while (res.next()) + pkNames.add(res.getString("FKCOLUMN_NAME")); + res = conn.getMetaData().getColumns(null, schema, tableName, null); + while (res.next()) { + if (pkNames.contains(res.getString("COLUMN_NAME"))) + // actualColumns.add(res.getString("COLUMN_NAME") + " " + // + res.getString("TYPE_NAME")); + actualColumns.add(res.getString("COLUMN_NAME")); + } + } catch (SQLException e) { + e.printStackTrace(); + } + } + Collections.sort(definedColumns); + Collections.sort(actualColumns); + if (!definedColumns.equals(actualColumns)) { + + String columnType = tableType.equals("subset") ? "primary key " : ""; + throw new TableSchemaMismatchException("The existing " + tableType + " table \"" + tableName + "\" has the following " + + columnType + + "columns: \"" + StringUtils.join(actualColumns, " ") + "\". 
However, the CoStoSys table " + + "schema \"" + schemaName + "\" that is used to operate on that table specifies a different set of " + columnType + "columns:" + + StringUtils.join(definedColumns, " ") + ". The active table schema is specified in the CoStoSys XML configuration file."); + } + } + } + + /** + *

+ * Sets the values of is_processed to TRUE and of + * is_in_process to FALSE for a collection of + * documents according to the given primary keys. + *

+ * + * @param subsetTableName name of the subset + * @param primaryKeyList the list of primary keys which itself can consist of several + * primary key elements + */ + public void setProcessed(String subsetTableName, ArrayList primaryKeyList) { + + FieldConfig fieldConfig = fieldConfigs.get(activeTableSchema); + + String whereArgument = StringUtils.join(fieldConfig.expandPKNames("%s = ?"), " AND "); + String update = "UPDATE " + subsetTableName + " SET is_processed = TRUE, is_in_process = FALSE" + " WHERE " + + whereArgument; + + try (CoStoSysConnection conn = obtainOrReserveConnection()) { + conn.setAutoCommit(false); + + PreparedStatement processed = conn.prepareStatement(update); + for (byte[][] primaryKey : primaryKeyList) { + for (int i = 0; i < primaryKey.length; i++) { + processed.setString(i + 1, new String(primaryKey[i])); + } + processed.addBatch(); + } + processed.executeBatch(); + conn.commit(); + + } catch (SQLException e) { + e.printStackTrace(); + } + } + + /** + *

+ * Sets the value of has_errors to TRUE and adds a + * description in log for exceptions which occurred during the + * processing of a collection of documents according to the given primary keys. + *

+ * + * @param subsetTableName name of the subset + * @param primaryKeyList the list of primary keys which itself can consist of several + * primary key elements + * @param logException matches primary keys of unsuccessfully processed documents and + * exceptions that occured during the processing + */ + public void setException(String subsetTableName, ArrayList primaryKeyList, + HashMap logException) { + + + FieldConfig fieldConfig = fieldConfigs.get(activeTableSchema); + + String whereArgument = StringUtils.join(fieldConfig.expandPKNames("%s = ?"), " AND "); + String update = "UPDATE " + subsetTableName + " SET has_errors = TRUE, log = ?" + " WHERE " + whereArgument; + + try (CoStoSysConnection conn = obtainOrReserveConnection()) { + conn.setAutoCommit(false); + + PreparedStatement processed = conn.prepareStatement(update); + for (byte[][] primaryKey : primaryKeyList) { + for (int i = 0; i < primaryKey.length; i++) { + processed.setString(1, logException.get(primaryKey)); + processed.setString(i + 2, new String(primaryKey[i])); + } + processed.addBatch(); + } + processed.executeBatch(); + conn.commit(); + + } catch (SQLException e) { + e.printStackTrace(); + } + } + + /** + * Returns the indices of the primary keys, beginning with 0. + */ + public List getPrimaryKeyIndices() { + FieldConfig fieldConfig = fieldConfigs.get(activeTableSchema); + List pkIndices = fieldConfig.getPrimaryKeyFieldNumbers(); + return pkIndices; + } + + public void checkTableSchemaCompatibility(String referenceSchema, String[] schemaNames) throws TableSchemaMismatchException { + String[] schemas = new String[schemaNames.length + 1]; + schemas[0] = referenceSchema; + System.arraycopy(schemaNames, 0, schemas, 1, schemaNames.length); + checkTableSchemaCompatibility(schemas); + } + + public void checkTableSchemaCompatibility(String... 
schemaNames) throws TableSchemaMismatchException { + if (null == schemaNames || schemaNames.length == 0) { + LOG.warn("No table schema names were passed - nothing to check."); + return; + } + List referenceKey = null; + String referenceSchemaName = null; + List notMatchingSchemaNames = new ArrayList<>(); + for (String schemaName : schemaNames) { + FieldConfig fieldConfig = fieldConfigs.get(schemaName); + String[] primaryKey = fieldConfig.getPrimaryKey(); + List asList = Arrays.asList(primaryKey); + Collections.sort(asList); + if (null == referenceKey) { + referenceKey = asList; + referenceSchemaName = schemaName; + } else { + if (!referenceKey.equals(asList)) + notMatchingSchemaNames.add(schemaName); + } + } + if (!notMatchingSchemaNames.isEmpty()) + throw new TableSchemaMismatchException( + "Found incompatibility of table schema definitions with schemas " + StringUtils.join(schemaNames, ", ") + ": There were at least one table schema pair that is not compatible to each other because their primary keys differ. The table schema \"" + + referenceSchemaName + "\" has the primary key \"" + fieldConfigs.get(referenceSchemaName).getPrimaryKeyString() + "\" which differs from the table schema(s) \"" + + StringUtils.join(notMatchingSchemaNames, ", ") + "\"."); + } + + public String getDbURL() { + return dbURL; + } + + public void setDbURL(String uri) { + dbURL = uri; + } + + public void close() { + releaseConnections(); + LOG.debug("Shutting down DataBaseConnector."); + if (dataSource instanceof HikariDataSource) { + LOG.debug("Checking if the datasource is still in use (perhaps by other threads or other DBC instances)"); + final int activeConnections = dataSource.getHikariPoolMXBean().getActiveConnections(); + final int awaitingConnection = dataSource.getHikariPoolMXBean().getThreadsAwaitingConnection(); + if (activeConnections > 0) { + LOG.debug("Data source is still in use ({} connections active), not closing it. 
Another DBC instance should exist that will attempt closing the data source at a later time point.", activeConnections); + } else if (awaitingConnection > 0) { + LOG.debug("There are no active connections right now but {} threads await a connection. Letting the data source open. Another DBC instance should close it later.", awaitingConnection); + } else { + LOG.debug("Data source does not have active connections, closing it."); + dataSource.close(); + } + } + } + + public boolean isDatabaseReachable() { + try (CoStoSysConnection ignored = obtainOrReserveConnection()) { + return true; + } catch (Exception e) { + LOG.warn("Got error when trying to connect to {}: {}", getDbURL(), e.getMessage()); + } + return false; + } + + /** + * Adds an auto-generated field configuration that exhibits the given primary key and all the fields required to + * store complete XMI document data (i.e. not segmented XMI parts but the whole serialized CAS) in a database table. + * The field configuration will have the given primary key and an additional field named 'xmi'. + * This method is used by the Jena Document Information + * System (JeDIS) components jcore-xmi-db-reader and jcore-xmi-db-consumer. + * + * @param primaryKey The document primary key for which a document CAS XMI table schema should be created. + * @param doGzip Whether the XMI data should be gzipped in the table. + * @return The created field configuration. + */ + public synchronized FieldConfig addXmiDocumentFieldConfiguration(List> primaryKey, boolean doGzip) { + String referenceSchema = doGzip ? 
"xmi_complete_cas_gzip" : "xmi_complete_cas"; + return addPKAdaptedFieldConfiguration(primaryKey, referenceSchema, "-complete-cas-xmi-autogenerated"); + } + + public synchronized FieldConfig addPKAdaptedFieldConfiguration(List> primaryKey, String fieldConfigurationForAdaption, String fieldConfigurationNameSuffix) { + List pkNames = primaryKey.stream().map(map -> map.get(JulieXMLConstants.NAME)).collect(Collectors.toList()); + String fieldConfigName = StringUtils.join(pkNames, "-") + fieldConfigurationNameSuffix; + FieldConfig ret; + if (!fieldConfigs.containsKey(fieldConfigName)) { + List> fields = new ArrayList<>(primaryKey); + FieldConfig xmiConfig = fieldConfigs.get(fieldConfigurationForAdaption); + HashSet xmiConfigPkIndices = new HashSet<>(xmiConfig.getPrimaryKeyFieldNumbers()); + // Add those fields to the new configuration that are not the primary key fields + IntStream.range(0, xmiConfig.getFields().size()). + filter(i -> !xmiConfigPkIndices.contains(i)). + mapToObj(i -> xmiConfig.getFields().get(i)). + forEach(fields::add); + ret = new FieldConfig(fields, "", fieldConfigName); + fieldConfigs.put(ret.getName(), ret); + } else { + ret = fieldConfigs.get(fieldConfigs.get(fieldConfigName)); + } + return ret; + } + + /** + * Adds an auto-generated field configuration that exhibits the given primary key and all the fields required to + * store XMI base document data (i.e. the document text but not its annotations) in a database table. The additional fields are + *
    + *
  1. xmi
  2. + *
  3. max_xmi_id
  4. + *
  5. sofa_mapping
  6. + *
+ * and are required for the storage of XMI annotation graph segments stored in other tables. The schema created with + * this method is to be used for the base documents that include the document text. To get a schema with a specific + * primary that stores annotation data, see {@link #addXmiAnnotationFieldConfiguration(List, boolean)}. + * This method is used by the Jena Document Information + * System (JeDIS) components jcore-xmi-db-reader and jcore-xmi-db-consumer. + * + * @param primaryKey The document primary key for which an base document XMI segmentation table schema should be created. + * @param doGzip Whether the XMI data should be gzipped in the table. + * @return The created field configuration. + */ + public synchronized FieldConfig addXmiTextFieldConfiguration(List> primaryKey, boolean doGzip) { + String referenceSchema = doGzip ? "xmi_text_gzip" : "xmi_text"; + return addPKAdaptedFieldConfiguration(primaryKey, referenceSchema, "-xmi-text-autogenerated"); + } + + /** + * Adds an auto-generated field configuration that exhibits the given primary key and all the fields required to + * store XMI annotation data (not base documents) in database tables. The only field besides the primary key is + * xmi and will store the actual XMI annotation data. This table schema + * is used for the storage of XMI annotation graph segments. Those segments will then correspond to + * UIMA annotation types that are stored in tables of their own. A table schema to store the base document + * is created by {@link #addXmiTextFieldConfiguration(List, boolean)}. + * This method is used by the Jena Document Information + * System (JeDIS) components jcore-xmi-db-reader and jcore-xmi-db-consumer. + * + * @param primaryKey The document primary key for which an base document XMI segmentation table schema should be created. + * @param doGzip Whether the XMI data should be gzipped in the table. + * @return The created field configuration. 
+ */ + public synchronized FieldConfig addXmiAnnotationFieldConfiguration(List> primaryKey, boolean doGzip) { + List pkNames = primaryKey.stream().map(map -> map.get(JulieXMLConstants.NAME)).collect(Collectors.toList()); + String fieldConfigName = StringUtils.join(pkNames, "-") + "-xmi-annotations-autogenerated"; + FieldConfig ret; + if (!fieldConfigs.containsKey(fieldConfigName)) { + List> fields = new ArrayList<>(); + // Important: For the annotation tables we don't want to return their primary key. They are used + // as AdditionalTable parameter to the XmiDBReader and the primary key is already returned from the + // data table schema. + // We make a copy of the primary key fields so we can change them without manipulating the given key. + primaryKey.stream().map(HashMap::new).forEach(fields::add); + fields.forEach(pkField -> pkField.put(JulieXMLConstants.RETRIEVE, "false")); + FieldConfig xmiConfig = fieldConfigs.get(doGzip ? "xmi_annotation_gzip" : "xmi_annotation"); + HashSet xmiConfigPkIndices = new HashSet<>(xmiConfig.getPrimaryKeyFieldNumbers()); + // Add those fields to the new configuration that are not the primary key fields + IntStream.range(0, xmiConfig.getFields().size()). + filter(i -> !xmiConfigPkIndices.contains(i)). + mapToObj(i -> xmiConfig.getFields().get(i)). + forEach(fields::add); + ret = new FieldConfig(fields, "", fieldConfigName); + fieldConfigs.put(ret.getName(), ret); + } else { + ret = fieldConfigs.get(fieldConfigs.get(fieldConfigName)); + } + return ret; + } + + public void resetSubset(String subsetTableName, List pkValues) { + try (CoStoSysConnection conn = obtainOrReserveConnection()) { + resetSubset(conn, subsetTableName, pkValues); + } + + } + + /** + * Returns the connection associated with the current thread object. To release used connections back to the connection pool, call {@link #releaseConnections()}. + * + * @return A connection associated with the current thread. 
+ * @throws IllegalStateException If there are no reserved connections for the current thread. + * @see #obtainOrReserveConnection() + * @see #releaseConnections() + * @see #reserveConnection() + */ + public CoStoSysConnection obtainConnection() { + Thread currentThread = Thread.currentThread(); + LOG.trace("Trying to obtain previously reserved connection for thread {}", currentThread.getName()); + List list; + try { + list = connectionCache.get(currentThread); + } catch (ExecutionException e) { + throw new RuntimeException(e); + } + cleanClosedReservedConnections(list, currentThread); + if (list.isEmpty()) + throw new NoReservedConnectionException("There are no reserved connections for the current thread with name \"" + currentThread.getName() + "\". You need to call reserveConnection() before obtaining one."); + // Return the newest connection. The idea is to stick "closer" to the time the connection was reserved so that + // a method can be sure that it reserves a connection for its subcalls. + final CoStoSysConnection conn = list.get(list.size() - 1); + LOG.trace("Obtaining already reserved connection {} for thread {}", conn.getConnection(), currentThread.getName()); + conn.incrementUsageNumber(); + return conn; + } + + /** + *

+ * This is the preferred way to obtain a database connection. It will reuse an existing connection or get a new one if required. + *

+ *

A reserved connection is required by many internal methods that need a database + * connection. They will acquire it by calling {@link #obtainConnection()}. This helps in reusing the same connection + * for multiple tasks within a single thread. This also helps to avoid deadlocks where a single thread requests + * multiple connections from the connection pool in method subcalls, blocking itself.

+ *

+ * Guaranteed to return either an already reserved connection or a newly reserved one. The newlyReserved property of the returned + * object indicates whether the returned connection was newly reserved or not (true / + * false, respectively). To comfortably release the connection only when it was newly reserved, use + * {@link #releaseConnection(CoStoSysConnection)} or simply {@link CoStoSysConnection#release()}. + *

+ * + * @return A pair consisting of connection and the information if it was newly reserved or not. + * @see #releaseConnection(CoStoSysConnection) + */ + public CoStoSysConnection obtainOrReserveConnection() { + LOG.trace("Connection requested, obtained or newly reserved"); + CoStoSysConnection connection; + int reservedConnections = getNumReservedConnections(); + if (reservedConnections == 0) { + connection = reserveConnection(); + } else { + connection = obtainConnection(); + if (LOG.isTraceEnabled()) + LOG.trace("There are connections available, obtained {}", connection.getConnection()); + } + return connection; + } + + public int getNumReservedConnections() { + Thread currentThread = Thread.currentThread(); + List list; + try { + list = connectionCache.get(currentThread); + } catch (ExecutionException e) { + throw new RuntimeException(e); + } + LOG.trace("Currently, there are {} connections reserved for thread {}", list.size(), Thread.currentThread().getName()); + if (!list.isEmpty()) { + cleanClosedReservedConnections(list, currentThread); + LOG.trace("After cleaning, {} connections remain for thread {}", list.size(), Thread.currentThread().getName()); + } + return list.size(); + } + + /** + * Checks the reserved connections whether they have already been closed and removes those from the passed list. + * + * @param list The list of reserved connections of a thread. 
+ */ + private void cleanClosedReservedConnections(List list, Thread thread) { + LOG.trace("Cleaning already closed connections from the list of reserved connections for thread {}", thread.getName()); + Iterator it = list.iterator(); + while (it.hasNext()) { + CoStoSysConnection conn = it.next(); + try { + if (conn.getConnection().isClosed()) { + LOG.trace("Removing connection {} from the list for thread \"{}\" because it is closed.", conn.getConnection(), thread.getName()); + it.remove(); + } + } catch (SQLException e) { + LOG.error("Exception occurred when checking if a connection is closed", e); + } + } + } + + /** + *

Only use when you are sure you need this method. Otherwise, use {@link #obtainOrReserveConnection()}

+ *

+ * Reserves a connection for the current thread. A reserved connection is required by many internal methods that need a database + * connection. They will acquire it by calling {@link #obtainConnection()}. This helps in reusing the same connection + * for multiple tasks within a single thread. This also helps to avoid deadlocks where a single thread requests + * multiple connections from the connection pool in method subcalls, blocking itself. + *

+ *

+ * Note that it is possible to reserve multiple connections but that this does not have any positive effect as of now. + * You should always only reserve one connection per thread. After the connection is not required any more, call + * {@link #releaseConnections()} to free the connection. + *

+ * + * @return The newly reserved connection. + * @see #obtainConnection() + * @see #releaseConnections() + */ + public CoStoSysConnection reserveConnection() { + if (LOG.isTraceEnabled()) { + final ConcurrentMap> map = connectionCache.asMap(); + StringBuilder sb = new StringBuilder("Current connection allocation:").append("\n"); + for (Thread t : map.keySet()) { + sb.append("Thread '").append(t.getName()).append("':\t\t").append(map.get(t).size()).append("\n"); + } + LOG.trace(sb.toString()); + } + + Thread currentThread = Thread.currentThread(); + LOG.trace("Trying to reserve a connection for thread \"{}\"", currentThread.getName()); + List list; + try { + list = connectionCache.get(currentThread); + } catch (ExecutionException e) { + throw new RuntimeException(e); + } + int listSize = list.size(); + cleanClosedReservedConnections(list, currentThread); + if (LOG.isTraceEnabled() && list.size() < listSize) { + LOG.trace("The list of connections for thread \"{}\" was shortened from {} to {} due to connections closed in the meantime.", currentThread.getName(), listSize, list.size()); + } + if (list.size() == dbConfig.getMaxConnections()) + LOG.warn("The current thread \"" + currentThread.getName() + "\" has already reserved " + list.size() + " connections. The connection pool is of size " + dbConfig.getMaxConnections() + ". Cannot reserve another connection. Call releaseConnections() to free reserved connections back to the pool. It will be tried to obtain a connection by waiting for one to get free. This might end in a timeout error."); + Connection conn = getConn(); + CoStoSysConnection costoConn = new CoStoSysConnection(this, conn, true); + list.add(costoConn); + LOG.trace("Reserving connection {} for thread \"{}\". This thread has now {} connections reserved.", conn, currentThread.getName(), list.size()); + return costoConn; + } + + /** + * Releases all connections associated with the current thread back to the connection pool. 
After this call, + * the current thread will not have any reserved connections left. + * + * @see #obtainOrReserveConnection() + * @see #reserveConnection() + * @see #obtainConnection() + */ + public void releaseConnections() { + Thread currentThread = Thread.currentThread(); + LOG.trace("Releasing all connections held for Thread \"{}\"", currentThread.getName()); + List connectionList; + try { + connectionList = connectionCache.get(currentThread); + } catch (ExecutionException e) { + throw new RuntimeException(e); + } + for (CoStoSysConnection conn : connectionList) { + try { + if (!conn.getConnection().isClosed()) { + LOG.trace("Closing connection {}", conn); + conn.getConnection().close(); + } + } catch (SQLException e) { + LOG.error("Could not release connection back to the pool", e); + } + } + connectionList.clear(); + } + + /** + * Removes the given connection from the list of reserved connection of the calling thread. If the connection + * was not reserved by the calling thread, an IllegalArgumentException will be raised. However, it is + * also possible to release connections received from another thread by just closing them via {@link Connection#close()}. + * This should only be used intentionally, however, to avoid confusion. + * + * @param conn + * @throws IllegalArgumentException If the given connection is not associated with the current thread. + */ + public void releaseConnection(CoStoSysConnection conn) throws SQLException { + Thread currentThread = Thread.currentThread(); + LOG.trace("Releasing connection {} for thread \"{}\"", conn.getConnection(), currentThread.getName()); + List connectionList; + try { + connectionList = connectionCache.get(currentThread); + } catch (ExecutionException e) { + throw new RuntimeException(e); + } + // Note that this will not remove anything if the connection is closed by a different thread than the originally reserving one. + // This shouldn't be an issue, however, since we clean up closed connections regularly. 
+ connectionList.remove(conn); + conn.getConnection().close(); + } + + + public Object withConnectionQuery(DbcQuery command) { + Object ret = null; + try (CoStoSysConnection ignored = obtainOrReserveConnection()) { + try { + ret = command.query(this); + } catch (Throwable throwable) { + LOG.error("Could not execute query", throwable); + } + } + return ret; + } + + public boolean withConnectionQueryBoolean(DbcQuery command) { + return (boolean) withConnectionQuery(command); + } + + public int withConnectionQueryInteger(DbcQuery command) { + return (int) withConnectionQuery(command); + } + + public double withConnectionQueryDouble(DbcQuery command) { + return (double) withConnectionQuery(command); + } + + public String withConnectionQueryString(DbcQuery query) { + return (String) withConnectionQuery(query); + } + + public void withConnectionExecute(DbcExecution command) { + boolean close = false; + try (CoStoSysConnection ignored = obtainOrReserveConnection()) { + try { + command.execute(this); + } catch (Throwable throwable) { + LOG.error("Could not execute SQL", throwable); + } + } + } + + public enum StatusElement {HAS_ERRORS, IS_PROCESSED, IN_PROCESS, TOTAL, LAST_COMPONENT} + + /** + * A class to parse xml files and make them accessible with an iterator + * + * @author hellrich + */ + private class XMLPreparer { + private final FieldConfig fieldConfig; + private File fileOrDir; + + protected XMLPreparer(File fileOrDir, FieldConfig fieldConfig) { + this.fileOrDir = fileOrDir; + this.fieldConfig = fieldConfig; + } + + /** + * Parses a xml file according to the FieldConfig for this DatabaseConnector + * + * @param fileName - file to parse + * @return - an iterator, yielding rows for a database + */ + protected Iterator> prepare(String fileName) { + + String xmlFilePath = fileOrDir.getAbsolutePath(); + if (fileOrDir.isDirectory()) { + xmlFilePath = xmlFilePath + "/" + fileName; + } + File xmlFile = new File(xmlFilePath); + boolean hugeFile = false; + if 
(!fileName.endsWith(".zip") && xmlFile.length() >= 1024 * 1024 * 1024) { + LOG.info("File is larger than 1GB. Trying VTD huge."); + hugeFile = true; + } + return JulieXMLTools.constructRowIterator(xmlFilePath, BUFFER_SIZE, fieldConfig.getForEachXPath(), + fieldConfig.getFields(), hugeFile); + + } + + } + +} diff --git a/src/main/java/de/julielab/xmlData/dataBase/DbcExecution.java b/src/main/java/de/julielab/costosys/dbconnection/DbcExecution.java similarity index 68% rename from src/main/java/de/julielab/xmlData/dataBase/DbcExecution.java rename to src/main/java/de/julielab/costosys/dbconnection/DbcExecution.java index bff0806..1af7eec 100644 --- a/src/main/java/de/julielab/xmlData/dataBase/DbcExecution.java +++ b/src/main/java/de/julielab/costosys/dbconnection/DbcExecution.java @@ -1,4 +1,4 @@ -package de.julielab.xmlData.dataBase; +package de.julielab.costosys.dbconnection; public interface DbcExecution { void execute(DataBaseConnector dbc) throws Throwable; diff --git a/src/main/java/de/julielab/xmlData/dataBase/DbcQuery.java b/src/main/java/de/julielab/costosys/dbconnection/DbcQuery.java similarity index 66% rename from src/main/java/de/julielab/xmlData/dataBase/DbcQuery.java rename to src/main/java/de/julielab/costosys/dbconnection/DbcQuery.java index 5a91ddd..e2c1b88 100644 --- a/src/main/java/de/julielab/xmlData/dataBase/DbcQuery.java +++ b/src/main/java/de/julielab/costosys/dbconnection/DbcQuery.java @@ -1,4 +1,4 @@ -package de.julielab.xmlData.dataBase; +package de.julielab.costosys.dbconnection; public interface DbcQuery { T query(DataBaseConnector dbc) throws Throwable; diff --git a/src/main/java/de/julielab/xmlData/dataBase/QueryHelper.java b/src/main/java/de/julielab/costosys/dbconnection/QueryHelper.java similarity index 92% rename from src/main/java/de/julielab/xmlData/dataBase/QueryHelper.java rename to src/main/java/de/julielab/costosys/dbconnection/QueryHelper.java index 64bd10c..afc50ee 100644 --- 
a/src/main/java/de/julielab/xmlData/dataBase/QueryHelper.java +++ b/src/main/java/de/julielab/costosys/dbconnection/QueryHelper.java @@ -1,4 +1,4 @@ -package de.julielab.xmlData.dataBase; +package de.julielab.costosys.dbconnection; import java.sql.Connection; import java.sql.ResultSet; diff --git a/src/main/java/de/julielab/xmlData/dataBase/SubsetStatus.java b/src/main/java/de/julielab/costosys/dbconnection/SubsetStatus.java similarity index 97% rename from src/main/java/de/julielab/xmlData/dataBase/SubsetStatus.java rename to src/main/java/de/julielab/costosys/dbconnection/SubsetStatus.java index 521eba4..f683474 100644 --- a/src/main/java/de/julielab/xmlData/dataBase/SubsetStatus.java +++ b/src/main/java/de/julielab/costosys/dbconnection/SubsetStatus.java @@ -1,4 +1,4 @@ -package de.julielab.xmlData.dataBase; +package de.julielab.costosys.dbconnection; import java.text.DecimalFormat; import java.util.Map.Entry; diff --git a/src/main/java/de/julielab/xmlData/dataBase/ThreadedColumnsIterator.java b/src/main/java/de/julielab/costosys/dbconnection/ThreadedColumnsIterator.java similarity index 98% rename from src/main/java/de/julielab/xmlData/dataBase/ThreadedColumnsIterator.java rename to src/main/java/de/julielab/costosys/dbconnection/ThreadedColumnsIterator.java index b7fd3c4..2b2cc22 100644 --- a/src/main/java/de/julielab/xmlData/dataBase/ThreadedColumnsIterator.java +++ b/src/main/java/de/julielab/costosys/dbconnection/ThreadedColumnsIterator.java @@ -1,15 +1,13 @@ -package de.julielab.xmlData.dataBase; +package de.julielab.costosys.dbconnection; +import de.julielab.costosys.configuration.FieldConfig; import de.julielab.xml.JulieXMLConstants; import de.julielab.xml.JulieXMLTools; -import de.julielab.xmlData.config.FieldConfig; import org.apache.commons.lang3.StringUtils; -import org.apache.commons.lang3.tuple.Pair; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; -import java.sql.Connection; import java.sql.ResultSet; import 
java.sql.SQLException; import java.sql.Statement; diff --git a/src/main/java/de/julielab/xmlData/dataBase/ThreadedColumnsToRetrieveIterator.java b/src/main/java/de/julielab/costosys/dbconnection/ThreadedColumnsToRetrieveIterator.java similarity index 99% rename from src/main/java/de/julielab/xmlData/dataBase/ThreadedColumnsToRetrieveIterator.java rename to src/main/java/de/julielab/costosys/dbconnection/ThreadedColumnsToRetrieveIterator.java index 7f2df26..6bf8646 100644 --- a/src/main/java/de/julielab/xmlData/dataBase/ThreadedColumnsToRetrieveIterator.java +++ b/src/main/java/de/julielab/costosys/dbconnection/ThreadedColumnsToRetrieveIterator.java @@ -1,8 +1,8 @@ -package de.julielab.xmlData.dataBase; +package de.julielab.costosys.dbconnection; +import de.julielab.costosys.configuration.FieldConfig; import de.julielab.xml.JulieXMLConstants; import de.julielab.xml.JulieXMLTools; -import de.julielab.xmlData.config.FieldConfig; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; import org.slf4j.Logger; @@ -90,7 +90,7 @@ public ThreadedColumnsToRetrieveIterator(DataBaseConnector dbc, List i /* * (non-Javadoc) * - * @see de.julielab.xmlData.dataBase.DBCThreadedIterator#destroy() + * @see DBCThreadedIterator#destroy() */ @Override public void close() { diff --git a/src/main/java/de/julielab/xmlData/dataBase/util/CoStoSysException.java b/src/main/java/de/julielab/costosys/dbconnection/util/CoStoSysException.java similarity index 91% rename from src/main/java/de/julielab/xmlData/dataBase/util/CoStoSysException.java rename to src/main/java/de/julielab/costosys/dbconnection/util/CoStoSysException.java index a5cf3ba..0112e9b 100644 --- a/src/main/java/de/julielab/xmlData/dataBase/util/CoStoSysException.java +++ b/src/main/java/de/julielab/costosys/dbconnection/util/CoStoSysException.java @@ -1,4 +1,4 @@ -package de.julielab.xmlData.dataBase.util; +package de.julielab.costosys.dbconnection.util; public class CoStoSysException extends 
Exception { public CoStoSysException() { diff --git a/src/main/java/de/julielab/xmlData/dataBase/util/CoStoSysRuntimeException.java b/src/main/java/de/julielab/costosys/dbconnection/util/CoStoSysRuntimeException.java similarity index 92% rename from src/main/java/de/julielab/xmlData/dataBase/util/CoStoSysRuntimeException.java rename to src/main/java/de/julielab/costosys/dbconnection/util/CoStoSysRuntimeException.java index 1135d3a..1cc102f 100644 --- a/src/main/java/de/julielab/xmlData/dataBase/util/CoStoSysRuntimeException.java +++ b/src/main/java/de/julielab/costosys/dbconnection/util/CoStoSysRuntimeException.java @@ -1,4 +1,4 @@ -package de.julielab.xmlData.dataBase.util; +package de.julielab.costosys.dbconnection.util; public class CoStoSysRuntimeException extends RuntimeException { public CoStoSysRuntimeException() { diff --git a/src/main/java/de/julielab/xmlData/dataBase/util/CoStoSysSQLRuntimeException.java b/src/main/java/de/julielab/costosys/dbconnection/util/CoStoSysSQLRuntimeException.java similarity index 92% rename from src/main/java/de/julielab/xmlData/dataBase/util/CoStoSysSQLRuntimeException.java rename to src/main/java/de/julielab/costosys/dbconnection/util/CoStoSysSQLRuntimeException.java index 8dcf561..a9c05c0 100644 --- a/src/main/java/de/julielab/xmlData/dataBase/util/CoStoSysSQLRuntimeException.java +++ b/src/main/java/de/julielab/costosys/dbconnection/util/CoStoSysSQLRuntimeException.java @@ -1,4 +1,4 @@ -package de.julielab.xmlData.dataBase.util; +package de.julielab.costosys.dbconnection.util; public class CoStoSysSQLRuntimeException extends CoStoSysRuntimeException { public CoStoSysSQLRuntimeException() { diff --git a/src/main/java/de/julielab/xmlData/dataBase/util/NoReservedConnectionException.java b/src/main/java/de/julielab/costosys/dbconnection/util/NoReservedConnectionException.java similarity index 92% rename from src/main/java/de/julielab/xmlData/dataBase/util/NoReservedConnectionException.java rename to 
src/main/java/de/julielab/costosys/dbconnection/util/NoReservedConnectionException.java index 5349150..f4e3dce 100644 --- a/src/main/java/de/julielab/xmlData/dataBase/util/NoReservedConnectionException.java +++ b/src/main/java/de/julielab/costosys/dbconnection/util/NoReservedConnectionException.java @@ -1,4 +1,4 @@ -package de.julielab.xmlData.dataBase.util; +package de.julielab.costosys.dbconnection.util; public class NoReservedConnectionException extends CoStoSysRuntimeException { public NoReservedConnectionException() { diff --git a/src/main/java/de/julielab/xmlData/dataBase/util/TableSchemaMismatchException.java b/src/main/java/de/julielab/costosys/dbconnection/util/TableSchemaMismatchException.java similarity index 92% rename from src/main/java/de/julielab/xmlData/dataBase/util/TableSchemaMismatchException.java rename to src/main/java/de/julielab/costosys/dbconnection/util/TableSchemaMismatchException.java index d83cb7b..4e45d5f 100644 --- a/src/main/java/de/julielab/xmlData/dataBase/util/TableSchemaMismatchException.java +++ b/src/main/java/de/julielab/costosys/dbconnection/util/TableSchemaMismatchException.java @@ -1,4 +1,4 @@ -package de.julielab.xmlData.dataBase.util; +package de.julielab.costosys.dbconnection.util; public class TableSchemaMismatchException extends CoStoSysException { public TableSchemaMismatchException() { diff --git a/src/main/java/de/julielab/xmlData/dataBase/util/UnobtainableConnectionException.java b/src/main/java/de/julielab/costosys/dbconnection/util/UnobtainableConnectionException.java similarity index 93% rename from src/main/java/de/julielab/xmlData/dataBase/util/UnobtainableConnectionException.java rename to src/main/java/de/julielab/costosys/dbconnection/util/UnobtainableConnectionException.java index e382758..1f16089 100644 --- a/src/main/java/de/julielab/xmlData/dataBase/util/UnobtainableConnectionException.java +++ b/src/main/java/de/julielab/costosys/dbconnection/util/UnobtainableConnectionException.java @@ -1,4 +1,4 @@ 
-package de.julielab.xmlData.dataBase.util; +package de.julielab.costosys.dbconnection.util; public class UnobtainableConnectionException extends CoStoSysRuntimeException { public UnobtainableConnectionException() { diff --git a/src/main/java/de/julielab/medline/ConfigurationConstants.java b/src/main/java/de/julielab/costosys/medline/ConfigurationConstants.java similarity index 90% rename from src/main/java/de/julielab/medline/ConfigurationConstants.java rename to src/main/java/de/julielab/costosys/medline/ConfigurationConstants.java index af4cd8c..9af37ff 100644 --- a/src/main/java/de/julielab/medline/ConfigurationConstants.java +++ b/src/main/java/de/julielab/costosys/medline/ConfigurationConstants.java @@ -1,4 +1,4 @@ -package de.julielab.medline; +package de.julielab.costosys.medline; public class ConfigurationConstants { public static final String INSERTION_INPUT = "insertion.directory"; diff --git a/src/main/java/de/julielab/medline/DBCMedlineUtilities.java b/src/main/java/de/julielab/costosys/medline/DBCMedlineUtilities.java similarity index 94% rename from src/main/java/de/julielab/medline/DBCMedlineUtilities.java rename to src/main/java/de/julielab/costosys/medline/DBCMedlineUtilities.java index e8e2754..a3af351 100644 --- a/src/main/java/de/julielab/medline/DBCMedlineUtilities.java +++ b/src/main/java/de/julielab/costosys/medline/DBCMedlineUtilities.java @@ -1,4 +1,4 @@ -package de.julielab.medline; +package de.julielab.costosys.medline; import java.io.ByteArrayOutputStream; import java.io.IOException; diff --git a/src/main/java/de/julielab/medline/ElasticSearchDocumentDeleter.java b/src/main/java/de/julielab/costosys/medline/ElasticSearchDocumentDeleter.java similarity index 99% rename from src/main/java/de/julielab/medline/ElasticSearchDocumentDeleter.java rename to src/main/java/de/julielab/costosys/medline/ElasticSearchDocumentDeleter.java index 2792005..4ee685f 100644 --- a/src/main/java/de/julielab/medline/ElasticSearchDocumentDeleter.java +++ 
b/src/main/java/de/julielab/costosys/medline/ElasticSearchDocumentDeleter.java @@ -1,4 +1,4 @@ -package de.julielab.medline; +package de.julielab.costosys.medline; import org.apache.commons.configuration2.HierarchicalConfiguration; import org.apache.commons.configuration2.tree.ImmutableNode; diff --git a/src/main/java/de/julielab/medline/IDocumentDeleter.java b/src/main/java/de/julielab/costosys/medline/IDocumentDeleter.java similarity index 94% rename from src/main/java/de/julielab/medline/IDocumentDeleter.java rename to src/main/java/de/julielab/costosys/medline/IDocumentDeleter.java index 52048d4..3f3d31e 100644 --- a/src/main/java/de/julielab/medline/IDocumentDeleter.java +++ b/src/main/java/de/julielab/costosys/medline/IDocumentDeleter.java @@ -1,4 +1,4 @@ -package de.julielab.medline; +package de.julielab.costosys.medline; import org.apache.commons.configuration2.HierarchicalConfiguration; import org.apache.commons.configuration2.tree.ImmutableNode; diff --git a/src/main/java/de/julielab/medline/MedlineDataTableDocumentDeleter.java b/src/main/java/de/julielab/costosys/medline/MedlineDataTableDocumentDeleter.java similarity index 89% rename from src/main/java/de/julielab/medline/MedlineDataTableDocumentDeleter.java rename to src/main/java/de/julielab/costosys/medline/MedlineDataTableDocumentDeleter.java index 064aba7..1e3e071 100644 --- a/src/main/java/de/julielab/medline/MedlineDataTableDocumentDeleter.java +++ b/src/main/java/de/julielab/costosys/medline/MedlineDataTableDocumentDeleter.java @@ -1,7 +1,7 @@ -package de.julielab.medline; +package de.julielab.costosys.medline; -import de.julielab.xmlData.Constants; -import de.julielab.xmlData.dataBase.DataBaseConnector; +import de.julielab.costosys.Constants; +import de.julielab.costosys.dbconnection.DataBaseConnector; import org.apache.commons.configuration2.HierarchicalConfiguration; import org.apache.commons.configuration2.tree.ImmutableNode; import org.slf4j.Logger; diff --git 
a/src/main/java/de/julielab/medline/MedlineDocumentDeletionException.java b/src/main/java/de/julielab/costosys/medline/MedlineDocumentDeletionException.java similarity index 94% rename from src/main/java/de/julielab/medline/MedlineDocumentDeletionException.java rename to src/main/java/de/julielab/costosys/medline/MedlineDocumentDeletionException.java index a3d1504..6f1c90d 100644 --- a/src/main/java/de/julielab/medline/MedlineDocumentDeletionException.java +++ b/src/main/java/de/julielab/costosys/medline/MedlineDocumentDeletionException.java @@ -1,4 +1,4 @@ -package de.julielab.medline; +package de.julielab.costosys.medline; public class MedlineDocumentDeletionException extends MedlineUpdateException { public MedlineDocumentDeletionException() { diff --git a/src/main/java/de/julielab/medline/MedlineUpdateException.java b/src/main/java/de/julielab/costosys/medline/MedlineUpdateException.java similarity index 93% rename from src/main/java/de/julielab/medline/MedlineUpdateException.java rename to src/main/java/de/julielab/costosys/medline/MedlineUpdateException.java index c08f255..8026d5d 100644 --- a/src/main/java/de/julielab/medline/MedlineUpdateException.java +++ b/src/main/java/de/julielab/costosys/medline/MedlineUpdateException.java @@ -1,4 +1,4 @@ -package de.julielab.medline; +package de.julielab.costosys.medline; public class MedlineUpdateException extends Exception { public MedlineUpdateException() { diff --git a/src/main/java/de/julielab/medline/Updater.java b/src/main/java/de/julielab/costosys/medline/Updater.java similarity index 98% rename from src/main/java/de/julielab/medline/Updater.java rename to src/main/java/de/julielab/costosys/medline/Updater.java index 80af840..1df31cf 100644 --- a/src/main/java/de/julielab/medline/Updater.java +++ b/src/main/java/de/julielab/costosys/medline/Updater.java @@ -1,10 +1,10 @@ -package de.julielab.medline; +package de.julielab.costosys.medline; import de.julielab.xml.JulieXMLConstants; import 
de.julielab.xml.JulieXMLTools; -import de.julielab.xmlData.Constants; -import de.julielab.xmlData.dataBase.CoStoSysConnection; -import de.julielab.xmlData.dataBase.DataBaseConnector; +import de.julielab.costosys.Constants; +import de.julielab.costosys.dbconnection.CoStoSysConnection; +import de.julielab.costosys.dbconnection.DataBaseConnector; import org.apache.commons.configuration2.HierarchicalConfiguration; import org.apache.commons.configuration2.tree.ImmutableNode; import org.slf4j.Logger; diff --git a/src/main/resources/defaultConfiguration.xml b/src/main/resources/defaultConfiguration.xml index f168709..10e0941 100644 --- a/src/main/resources/defaultConfiguration.xml +++ b/src/main/resources/defaultConfiguration.xml @@ -38,22 +38,6 @@ - - - - - - - - - - - - - - - - @@ -61,16 +45,6 @@ - - - - - - - - diff --git a/src/test/java/de/julielab/xmlData/cli/CLITest.java b/src/test/java/de/julielab/costosys/cli/CLITest.java similarity index 62% rename from src/test/java/de/julielab/xmlData/cli/CLITest.java rename to src/test/java/de/julielab/costosys/cli/CLITest.java index 171f877..ab01a3e 100644 --- a/src/test/java/de/julielab/xmlData/cli/CLITest.java +++ b/src/test/java/de/julielab/costosys/cli/CLITest.java @@ -1,17 +1,16 @@ -package de.julielab.xmlData.cli; +package de.julielab.costosys.cli; +import de.julielab.costosys.cli.CLI; import de.julielab.jcore.db.test.DBTestUtils; -import de.julielab.xmlData.Constants; -import de.julielab.xmlData.dataBase.DataBaseConnector; -import de.julielab.xmlData.dataBase.util.TableSchemaMismatchException; +import de.julielab.costosys.Constants; +import de.julielab.costosys.dbconnection.DataBaseConnector; import org.apache.commons.configuration2.ex.ConfigurationException; import org.testcontainers.containers.PostgreSQLContainer; import org.testng.annotations.AfterClass; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; -import java.sql.SQLException; import static org.assertj.core.api.Assertions.*; 
public class CLITest { @@ -38,7 +37,7 @@ public static void shutdown(){ @Test public void testImport() throws Exception { - assertThatCode(() -> CLI.main(new String[]{"-i", "src/test/resources/pubmedsample18n0001.xml.gz"})).doesNotThrowAnyException(); + assertThatCode(() -> de.julielab.costosys.cli.CLI.main(new String[]{"-i", "src/test/resources/pubmedsample18n0001.xml.gz"})).doesNotThrowAnyException(); dbc.reserveConnection(); assertThat(dbc.tableExists(Constants.DEFAULT_DATA_TABLE_NAME)); assertThat(dbc.getNumRows(Constants.DEFAULT_DATA_TABLE_NAME)).isEqualTo(177); @@ -46,15 +45,15 @@ public void testImport() throws Exception { @Test(dependsOnMethods = "testImport") public void testCreateSubset() { - assertThatCode(() -> CLI.main(new String[]{"-s", "all_subset", "-a"})).doesNotThrowAnyException(); - assertThatCode(() -> CLI.main(new String[]{"-s", "random_subset", "-r", "10"})).doesNotThrowAnyException(); - assertThatCode(() -> CLI.main(new String[]{"-s", "mirror_subset", "-m"})).doesNotThrowAnyException(); + assertThatCode(() -> de.julielab.costosys.cli.CLI.main(new String[]{"-s", "all_subset", "-a"})).doesNotThrowAnyException(); + assertThatCode(() -> de.julielab.costosys.cli.CLI.main(new String[]{"-s", "random_subset", "-r", "10"})).doesNotThrowAnyException(); + assertThatCode(() -> de.julielab.costosys.cli.CLI.main(new String[]{"-s", "mirror_subset", "-m"})).doesNotThrowAnyException(); } @Test(dependsOnMethods = "testCreateSubset") public void testStatus() { - assertThatCode(() -> CLI.main(new String[]{"-st", "all_subset"})).doesNotThrowAnyException(); - assertThatCode(() -> CLI.main(new String[]{"-st", "random_subset"})).doesNotThrowAnyException(); + assertThatCode(() -> de.julielab.costosys.cli.CLI.main(new String[]{"-st", "all_subset"})).doesNotThrowAnyException(); + assertThatCode(() -> de.julielab.costosys.cli.CLI.main(new String[]{"-st", "random_subset"})).doesNotThrowAnyException(); assertThatCode(() -> CLI.main(new String[]{"-st", 
"mirror_subset"})).doesNotThrowAnyException(); } } diff --git a/src/test/java/de/julielab/xmlData/config/ConfigReaderTest.java b/src/test/java/de/julielab/costosys/configuration/ConfigReaderTest.java similarity index 87% rename from src/test/java/de/julielab/xmlData/config/ConfigReaderTest.java rename to src/test/java/de/julielab/costosys/configuration/ConfigReaderTest.java index b91c50f..8a0deff 100644 --- a/src/test/java/de/julielab/xmlData/config/ConfigReaderTest.java +++ b/src/test/java/de/julielab/costosys/configuration/ConfigReaderTest.java @@ -13,7 +13,7 @@ * Creation date: 06.04.2011 **/ -package de.julielab.xmlData.config; +package de.julielab.costosys.configuration; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -25,8 +25,6 @@ import java.util.List; import java.util.Map; -import de.julielab.xmlData.Constants; -import de.julielab.xmlData.dataBase.DataBaseConnector; import org.apache.commons.io.IOUtils; import org.junit.Test; @@ -68,7 +66,7 @@ public void testMergeConfigDataWithAll() throws SecurityException, userConf = IOUtils.toByteArray(is); // Merge default and user configuration. 
- Method mergeConfigData = ConfigReader.class.getDeclaredMethod( + Method mergeConfigData = de.julielab.costosys.configuration.ConfigReader.class.getDeclaredMethod( "mergeConfigData", byte[].class, byte[].class); mergeConfigData.setAccessible(true); mergedConf = (byte[]) mergeConfigData.invoke(null, defaultConf, @@ -94,7 +92,7 @@ public void testMergeConfigDataWithoutSchema() throws SecurityException, byte[] mergedConf; byte[] mergedConfCorrect; - Method mergeConfigData = ConfigReader.class.getDeclaredMethod( + Method mergeConfigData = de.julielab.costosys.configuration.ConfigReader.class.getDeclaredMethod( "mergeConfigData", byte[].class, byte[].class); mergeConfigData.setAccessible(true); is = ConfigReaderTest.class @@ -106,7 +104,7 @@ public void testMergeConfigDataWithoutSchema() throws SecurityException, userConf = IOUtils.toByteArray(is); // Merge default and user configuration. - mergedConf = (byte[]) mergeConfigData.invoke(new ConfigReader(null), + mergedConf = (byte[]) mergeConfigData.invoke(new de.julielab.costosys.configuration.ConfigReader(null), defaultConf, userConf); // Check whether the result matches the correct version. @@ -129,7 +127,7 @@ public void testMergeConfigDataWithoutDB() throws SecurityException, byte[] mergedConf; byte[] mergedConfCorrect; - Method mergeConfigData = ConfigReader.class.getDeclaredMethod( + Method mergeConfigData = de.julielab.costosys.configuration.ConfigReader.class.getDeclaredMethod( "mergeConfigData", byte[].class, byte[].class); mergeConfigData.setAccessible(true); is = ConfigReaderTest.class @@ -141,7 +139,7 @@ public void testMergeConfigDataWithoutDB() throws SecurityException, userConf = IOUtils.toByteArray(is); // Merge default and user configuration. - mergedConf = (byte[]) mergeConfigData.invoke(new ConfigReader(null), + mergedConf = (byte[]) mergeConfigData.invoke(new de.julielab.costosys.configuration.ConfigReader(null), defaultConf, userConf); // Check whether the result matches the correct version. 
@@ -158,7 +156,7 @@ public void dbConfigTest() throws VTDException, IOException { // Just read in any configuration defining a database connection. is = ConfigReaderTest.class .getResourceAsStream("/configuration/confWithAll.xml"); - DBConfig dbconf = new DBConfig(IOUtils.toByteArray(is)); + de.julielab.costosys.configuration.DBConfig dbconf = new DBConfig(IOUtils.toByteArray(is)); assertEquals("jdbc:postgresql://aserver.net/aDB", dbconf.getUrl()); assertEquals("anotherschema", dbconf.getActiveDataPGSchema()); } @@ -171,8 +169,8 @@ public void fieldConfigTest() throws VTDException, IOException { .getResourceAsStream("/configuration/confWithAll.xml"); byte[] config = IOUtils.toByteArray(is); String activeSchemaName = ConfigBase.getActiveConfig(config, - ConfigReader.XPATH_ACTIVE_TABLE_SCHEMA); - FieldConfig fc = new FieldConfig(config, activeSchemaName); + de.julielab.costosys.configuration.ConfigReader.XPATH_ACTIVE_TABLE_SCHEMA); + de.julielab.costosys.configuration.FieldConfig fc = new FieldConfig(config, activeSchemaName); List> fields = fc.getFields(); Map field = fields.get(0); @@ -206,10 +204,10 @@ public void fieldConfigTest() throws VTDException, IOException { public void configReaderTest() { InputStream is = null; @SuppressWarnings("unused") - ConfigReader cr = null; + de.julielab.costosys.configuration.ConfigReader cr = null; // It is valid not to deliver a user configuration at all. The default // should be used. This shouldn't raise any error. - cr = new ConfigReader(is); + cr = new de.julielab.costosys.configuration.ConfigReader(is); // Now load a quite normal schema. is = ConfigReaderTest.class @@ -218,20 +216,20 @@ public void configReaderTest() { assertTrue(is != null); // Now check whether the merging of configurations without errors. - cr = new ConfigReader(is); + cr = new de.julielab.costosys.configuration.ConfigReader(is); // Repeat with different kinds of configurations. 
is = ConfigReaderTest.class .getResourceAsStream("/configuration/confWithoutSchema.xml"); assertTrue(is != null); - cr = new ConfigReader(is); + cr = new de.julielab.costosys.configuration.ConfigReader(is); // ---------------------------- is = ConfigReaderTest.class .getResourceAsStream("/configuration/confWithoutDB.xml"); assertTrue(is != null); - cr = new ConfigReader(is); + cr = new de.julielab.costosys.configuration.ConfigReader(is); } @Test @@ -242,7 +240,7 @@ public void testGetAllSchemaNames() throws SecurityException, // ConfigReader. InputStream is = ConfigReaderTest.class .getResourceAsStream("/configuration/confWithAll.xml"); - ConfigReader cr = new ConfigReader(is); + de.julielab.costosys.configuration.ConfigReader cr = new ConfigReader(is); Method getSchemaNamesMethod = cr.getClass().getDeclaredMethod( "getAllSchemaNames", byte[].class); diff --git a/src/test/java/de/julielab/xmlData/config/FieldConfigTest.java b/src/test/java/de/julielab/costosys/configuration/FieldConfigTest.java similarity index 83% rename from src/test/java/de/julielab/xmlData/config/FieldConfigTest.java rename to src/test/java/de/julielab/costosys/configuration/FieldConfigTest.java index 02e8234..ba96233 100644 --- a/src/test/java/de/julielab/xmlData/config/FieldConfigTest.java +++ b/src/test/java/de/julielab/costosys/configuration/FieldConfigTest.java @@ -16,8 +16,8 @@ /** * */ -package de.julielab.xmlData.config; -import static de.julielab.xmlData.config.FieldConfig.createField; +package de.julielab.costosys.configuration; +import static de.julielab.costosys.configuration.FieldConfig.createField; import static org.assertj.core.api.Assertions.*; import static org.junit.Assert.assertEquals; @@ -48,7 +48,7 @@ public class FieldConfigTest { public void testBuildFields() throws SecurityException, NoSuchMethodException, FileNotFoundException, IOException, VTDException { byte[] configData = IOUtils.toByteArray(new FileInputStream("src/test/resources/configuration/confWithAll.xml")); // 
Test schema without explicit field closing tags. - FieldConfig fieldConfig = new FieldConfig(configData, "userTableSchema1"); + de.julielab.costosys.configuration.FieldConfig fieldConfig = new de.julielab.costosys.configuration.FieldConfig(configData, "userTableSchema1"); List> fields = fieldConfig.getFields(); Map field1 = fields.get(0); Map field2 = fields.get(1); @@ -69,7 +69,7 @@ public void testBuildFields() throws SecurityException, NoSuchMethodException, F assertEquals("true", field2.get(JulieXMLConstants.RETRIEVE)); // Test schema with explicit field closing tags. - fieldConfig = new FieldConfig(configData, "userTableSchema2"); + fieldConfig = new de.julielab.costosys.configuration.FieldConfig(configData, "userTableSchema2"); fields = fieldConfig.getFields(); field1 = fields.get(0); field2 = fields.get(1); @@ -99,11 +99,11 @@ public void testIncompleteProgramaticallyDefinedFieldConfig() { field.put(JulieXMLConstants.NAME, "field1"); fields.add(field); // The type property is missing - assertThatThrownBy(() -> new FieldConfig(fields, "", "testschema")).hasMessageContaining("required \"" + JulieXMLConstants.TYPE + "\" property"); + assertThatThrownBy(() -> new de.julielab.costosys.configuration.FieldConfig(fields, "", "testschema")).hasMessageContaining("required \"" + JulieXMLConstants.TYPE + "\" property"); field.remove(JulieXMLConstants.NAME); field.put(JulieXMLConstants.TYPE, "type1"); // Now the name property is missing - assertThatThrownBy(() -> new FieldConfig(fields, "", "testschema")).hasMessageContaining("required \"" + JulieXMLConstants.NAME + "\" property"); + assertThatThrownBy(() -> new de.julielab.costosys.configuration.FieldConfig(fields, "", "testschema")).hasMessageContaining("required \"" + JulieXMLConstants.NAME + "\" property"); } @Test @@ -113,7 +113,7 @@ public void testProgrammaticallyDefinedFieldConfig() { fields.add(createField(JulieXMLConstants.NAME, "field1",JulieXMLConstants.TYPE, "type1", JulieXMLConstants.PRIMARY_KEY, "true")); 
fields.add(createField(JulieXMLConstants.NAME, "field2", JulieXMLConstants.TYPE, "type2", JulieXMLConstants.PRIMARY_KEY, "true")); fields.add(createField(JulieXMLConstants.NAME, "field3",JulieXMLConstants.TYPE, "type3", JulieXMLConstants.RETRIEVE, "true")); - FieldConfig config = new FieldConfig(fields, "foreach", "testschema"); + de.julielab.costosys.configuration.FieldConfig config = new FieldConfig(fields, "foreach", "testschema"); assertThat(config.getPrimaryKeyString()).isEqualToIgnoringWhitespace("field1,field2"); assertThat(config.getColumnsToRetrieve()).isEqualTo(new String[]{"field3"}); assertThat(config.getForEachXPath()).isEqualTo("foreach"); diff --git a/src/test/java/de/julielab/xmlData/dataBase/DataBaseConnectorTest.java b/src/test/java/de/julielab/costosys/dbconnection/DataBaseConnectorTest.java similarity index 90% rename from src/test/java/de/julielab/xmlData/dataBase/DataBaseConnectorTest.java rename to src/test/java/de/julielab/costosys/dbconnection/DataBaseConnectorTest.java index 3be86c9..462815e 100644 --- a/src/test/java/de/julielab/xmlData/dataBase/DataBaseConnectorTest.java +++ b/src/test/java/de/julielab/costosys/dbconnection/DataBaseConnectorTest.java @@ -1,8 +1,8 @@ -package de.julielab.xmlData.dataBase; +package de.julielab.costosys.dbconnection; -import de.julielab.xmlData.Constants; -import de.julielab.xmlData.cli.TableNotFoundException; -import de.julielab.xmlData.dataBase.util.TableSchemaMismatchException; +import de.julielab.costosys.Constants; +import de.julielab.costosys.cli.TableNotFoundException; +import de.julielab.costosys.dbconnection.util.TableSchemaMismatchException; import org.testcontainers.containers.PostgreSQLContainer; import org.testng.annotations.AfterClass; import org.testng.annotations.BeforeClass; @@ -20,13 +20,13 @@ public class DataBaseConnectorTest { public static PostgreSQLContainer postgres; - private static DataBaseConnector dbc; + private static de.julielab.costosys.dbconnection.DataBaseConnector dbc; 
@BeforeClass public static void setup() { postgres = new PostgreSQLContainer(); postgres.start(); - dbc = new DataBaseConnector(postgres.getJdbcUrl(), postgres.getUsername(), postgres.getPassword()); + dbc = new de.julielab.costosys.dbconnection.DataBaseConnector(postgres.getJdbcUrl(), postgres.getUsername(), postgres.getPassword()); dbc.setActiveTableSchema("medline_2016"); } @@ -104,7 +104,7 @@ public void testQuerySubset() throws SQLException { dbc.initSubset("querysubset", Constants.DEFAULT_DATA_TABLE_NAME); assertThat(dbc.getNumRows("querysubset")).isGreaterThan(0); dbc.releaseConnections(); - DBCIterator it = dbc.querySubset("querysubset", 0); + de.julielab.costosys.dbconnection.DBCIterator it = dbc.querySubset("querysubset", 0); Set retrieved = new HashSet<>(); while (it.hasNext()) { byte[][] next = it.next(); diff --git a/src/test/java/de/julielab/xmlData/dataBase/ThreadedColumnsIteratorTest.java b/src/test/java/de/julielab/costosys/dbconnection/ThreadedColumnsIteratorTest.java similarity index 73% rename from src/test/java/de/julielab/xmlData/dataBase/ThreadedColumnsIteratorTest.java rename to src/test/java/de/julielab/costosys/dbconnection/ThreadedColumnsIteratorTest.java index d25522a..1189eba 100644 --- a/src/test/java/de/julielab/xmlData/dataBase/ThreadedColumnsIteratorTest.java +++ b/src/test/java/de/julielab/costosys/dbconnection/ThreadedColumnsIteratorTest.java @@ -1,6 +1,6 @@ -package de.julielab.xmlData.dataBase; +package de.julielab.costosys.dbconnection; -import de.julielab.xmlData.Constants; +import de.julielab.costosys.Constants; import org.junit.BeforeClass; import org.junit.ClassRule; import org.junit.Test; @@ -9,7 +9,6 @@ import org.testcontainers.containers.PostgreSQLContainer; import java.io.IOException; -import java.sql.Connection; import java.sql.SQLException; import java.util.Arrays; @@ -19,7 +18,7 @@ public class ThreadedColumnsIteratorTest { private final static Logger log = LoggerFactory.getLogger(ThreadedColumnsIteratorTest.class); 
@ClassRule public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); - private static DataBaseConnector dbc; + private static de.julielab.costosys.dbconnection.DataBaseConnector dbc; @BeforeClass public static void setup() throws SQLException, IOException { @@ -36,7 +35,7 @@ public static void setup() throws SQLException, IOException { @Test public void testIterator() throws SQLException { try (CoStoSysConnection conn = dbc.reserveConnection()) { - ThreadedColumnsIterator it = new ThreadedColumnsIterator(dbc, conn, Arrays.asList("pmid", "xml"), Constants.DEFAULT_DATA_TABLE_NAME); + de.julielab.costosys.dbconnection.ThreadedColumnsIterator it = new de.julielab.costosys.dbconnection.ThreadedColumnsIterator(dbc, conn, Arrays.asList("pmid", "xml"), Constants.DEFAULT_DATA_TABLE_NAME); int numRetrieved = 0; while (it.hasNext()) { Object[] next = it.next(); @@ -50,7 +49,7 @@ public void testIterator() throws SQLException { @Test public void testIteratorWithoutExternalConnection() throws InterruptedException { // Repeat the very same lines of code a few times to make sure that connections are released properly - ThreadedColumnsIterator it = new ThreadedColumnsIterator(dbc, Arrays.asList("pmid", "xml"), Constants.DEFAULT_DATA_TABLE_NAME); + de.julielab.costosys.dbconnection.ThreadedColumnsIterator it = new de.julielab.costosys.dbconnection.ThreadedColumnsIterator(dbc, Arrays.asList("pmid", "xml"), Constants.DEFAULT_DATA_TABLE_NAME); int numRetrieved = 0; while (it.hasNext()) { Object[] next = it.next(); @@ -59,7 +58,7 @@ public void testIteratorWithoutExternalConnection() throws InterruptedException } assertEquals(10, numRetrieved); - it = new ThreadedColumnsIterator(dbc, Arrays.asList("pmid", "xml"), Constants.DEFAULT_DATA_TABLE_NAME); + it = new de.julielab.costosys.dbconnection.ThreadedColumnsIterator(dbc, Arrays.asList("pmid", "xml"), Constants.DEFAULT_DATA_TABLE_NAME); numRetrieved = 0; while (it.hasNext()) { Object[] next = 
it.next(); @@ -68,7 +67,7 @@ public void testIteratorWithoutExternalConnection() throws InterruptedException } assertEquals(10, numRetrieved); - it = new ThreadedColumnsIterator(dbc, Arrays.asList("pmid", "xml"), Constants.DEFAULT_DATA_TABLE_NAME); + it = new de.julielab.costosys.dbconnection.ThreadedColumnsIterator(dbc, Arrays.asList("pmid", "xml"), Constants.DEFAULT_DATA_TABLE_NAME); numRetrieved = 0; while (it.hasNext()) { Object[] next = it.next(); @@ -83,7 +82,7 @@ public void testIteratorWithoutExternalConnection() throws InterruptedException @Test public void testIteratorWithLimit() throws SQLException { try (CoStoSysConnection conn = dbc.reserveConnection()) { - ThreadedColumnsIterator it = new ThreadedColumnsIterator(dbc, conn, Arrays.asList("pmid", "xml"), Constants.DEFAULT_DATA_TABLE_NAME, 2); + de.julielab.costosys.dbconnection.ThreadedColumnsIterator it = new de.julielab.costosys.dbconnection.ThreadedColumnsIterator(dbc, conn, Arrays.asList("pmid", "xml"), Constants.DEFAULT_DATA_TABLE_NAME, 2); int numRetrieved = 0; while (it.hasNext()) { Object[] next = it.next(); diff --git a/src/test/java/de/julielab/xmlData/dataBase/ThreadedColumnsToRetrieveIteratorTest.java b/src/test/java/de/julielab/costosys/dbconnection/ThreadedColumnsToRetrieveIteratorTest.java similarity index 74% rename from src/test/java/de/julielab/xmlData/dataBase/ThreadedColumnsToRetrieveIteratorTest.java rename to src/test/java/de/julielab/costosys/dbconnection/ThreadedColumnsToRetrieveIteratorTest.java index a89acaa..e31e7f6 100644 --- a/src/test/java/de/julielab/xmlData/dataBase/ThreadedColumnsToRetrieveIteratorTest.java +++ b/src/test/java/de/julielab/costosys/dbconnection/ThreadedColumnsToRetrieveIteratorTest.java @@ -1,6 +1,6 @@ -package de.julielab.xmlData.dataBase; +package de.julielab.costosys.dbconnection; -import de.julielab.xmlData.Constants; +import de.julielab.costosys.Constants; import org.junit.BeforeClass; import org.junit.ClassRule; import org.junit.Test; @@ -9,10 +9,8 @@ import
org.testcontainers.containers.PostgreSQLContainer; import java.io.IOException; -import java.sql.Connection; import java.sql.SQLException; import java.util.Arrays; -import java.util.List; import static org.junit.Assert.assertEquals; @@ -20,7 +18,7 @@ public class ThreadedColumnsToRetrieveIteratorTest { private final static Logger log = LoggerFactory.getLogger(ThreadedColumnsToRetrieveIteratorTest.class); @ClassRule public static PostgreSQLContainer postgres = (PostgreSQLContainer) new PostgreSQLContainer(); - private static DataBaseConnector dbc; + private static de.julielab.costosys.dbconnection.DataBaseConnector dbc; @BeforeClass public static void setup() throws SQLException, IOException { @@ -37,7 +35,7 @@ public void testIterator() throws Exception { try (CoStoSysConnection conn = dbc.reserveConnection()) { - ThreadedColumnsToRetrieveIterator it = new ThreadedColumnsToRetrieveIterator(dbc, conn, Arrays.asList(new Object[]{"10922238"}), Constants.DEFAULT_DATA_TABLE_NAME, "medline_2016"); + de.julielab.costosys.dbconnection.ThreadedColumnsToRetrieveIterator it = new de.julielab.costosys.dbconnection.ThreadedColumnsToRetrieveIterator(dbc, conn, Arrays.asList(new Object[]{"10922238"}), Constants.DEFAULT_DATA_TABLE_NAME, "medline_2016"); int numRetrieved = 0; while (it.hasNext()) { Object[] next = it.next(); @@ -50,7 +48,7 @@ public void testIterator() throws Exception { @Test public void testIteratorWithoutExternalConnection() throws Exception { - ThreadedColumnsToRetrieveIterator it = new ThreadedColumnsToRetrieveIterator(dbc, null, Arrays.asList(new Object[]{"10922238"}), Constants.DEFAULT_DATA_TABLE_NAME, "medline_2016"); + de.julielab.costosys.dbconnection.ThreadedColumnsToRetrieveIterator it = new de.julielab.costosys.dbconnection.ThreadedColumnsToRetrieveIterator(dbc, null, Arrays.asList(new Object[]{"10922238"}), Constants.DEFAULT_DATA_TABLE_NAME, "medline_2016"); int numRetrieved = 0; while (it.hasNext()) { Object[] next =
it.next();