package textbender.d.gene; // Copyright 2006-2007, Michael Allan. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Textbender Software"), to deal in the Textbender Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicence, and/or sell copies of the Textbender Software, and to permit persons to whom the Textbender Software is furnished to do so, subject to the following conditions: The preceding copyright notice and this permission notice shall be included in all copies or substantial portions of the Textbender Software. THE TEXTBENDER SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE TEXTBENDER SOFTWARE OR THE USE OR OTHER DEALINGS IN THE TEXTBENDER SOFTWARE.

import java.util.*;
import org.w3c.dom.*;
import textbender._.*;
import textbender.g.lang.*;
import textbender.g.xml.dom.*;

import static textbender._.Textbender.TEXTBENDER_NAMESPACE;

/** Utilities for working with genes.
  *
  *     @see "gene.mod"
  */
public final @ThreadSafe class Gene {

    private Gene() {}

    /** Transforms a genetic-form sequence into an abstract form suitable
      * for embeddable encoding.  The sequence is modified in place:
      * its 'g' attribute is removed, contained genes are replaced by stub elements,
      * and namespaces are flattened to null.
      *
      *     @param sequence genetic-form sequence
      *     @param gList list of gene meta-data ('g') elements in the document,
      *         indexed by g-index
      *
      *     @see "d/gene/note.xht#Abstraction"
      */
    public static void abstract_embeddable(final Element sequence, final List<Element> gList) {

        // 1. Remove meta-data 'g' attribute.
        // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        sequence.removeAttributeNS(TEXTBENDER_NAMESPACE, "g");

        // 2. Substitute stub elements for contained genes.
        // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        final Node originalFirstChild = sequence.getFirstChild();
        for (Node child = originalFirstChild; child != null; child = child.getNextSibling()) {
            if (!(child instanceof Element)) {
                continue;
            }

            final int gIndex = gIndexOf((Element) child);
            if (!gIndexIsGene(gIndex)) {
                continue;
            }

            // inserting stubs to head of sequence (tail to be deleted later)
            sequence.insertBefore(createStub_embeddable(gList.get(gIndex)), originalFirstChild);
        }

        // If no stub was inserted ahead of the original children, there are no child genes.
        final boolean isLeaf = sequence.getFirstChild() == originalFirstChild;
        if (isLeaf) {
            // substitute stubs in depth
            // ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` `
            for (Node child = originalFirstChild; child != null; child = child.getNextSibling()) {
                // not child itself, which cannot be a gene in this case
                abstract_embeddable_stubifyDescendants(child, gList);
            }
        } else { // structural parent
            // remove all but stubs
            // ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` `
            // Walk backward from the tail, removing up to and including the original first child.
            Node child;
            Node c = sequence.getLastChild();
            for (;;) {
                child = c;
                c = c.getPreviousSibling(); // fetch before removal detaches the sibling links
                sequence.removeChild(child);
                if (child == originalFirstChild) {
                    break;
                }
            }
        }

        // 3. Flatten namespaces.
        // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        if (isLeaf) {
            abstract_embeddable_removeNamespacesRecursively(sequence);
        } else {
            abstract_embeddable_removeNamespace(sequence); // descendants have none, in this case
        }
    }

    /** Renames the node to its local name in the null namespace;
      * does nothing if its namespace is already null.
      */
    private static void abstract_embeddable_removeNamespace(final Node node) {
        if (node.getNamespaceURI() == null) {
            return;
        }

        node.getOwnerDocument().renameNode(node, null, node.getLocalName());
    }

    /** Removes the namespace of the node's descendants, of the node itself,
      * and of the node's attributes.
      */
    private static void abstract_embeddable_removeNamespacesRecursively(final Node node) {
        Node child = node.getFirstChild();
        while (child != null) {
            // Fetch the successor first: renameNode is permitted to replace a node
            // with a new one, in which case the old node has no siblings afterward.
            final Node next = child.getNextSibling();
            abstract_embeddable_removeNamespacesRecursively(child);
            child = next;
        }

        abstract_embeddable_removeNamespace(node);
        if (node.hasAttributes()) {
            final NamedNodeMap map = node.getAttributes();
            for (int a = map.getLength() - 1; a >= 0; --a) {
                Attr attr = (Attr) map.item(a);
                abstract_embeddable_removeNamespace(attr);
            }
        }
    }

    /** Replaces each descendant gene of the node with a stub element.
      * The node itself is not tested.  A replaced gene is not descended into.
      */
    private static void abstract_embeddable_stubifyDescendants(Node node, List<Element> gList) {
        for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
            if (child instanceof Element) {
                final Element element = (Element) child; // the child is tested, not its parent
                final int gIndex = gIndexOf(element);
                if (gIndexIsGene(gIndex)) {
                    Element newChild = createStub_embeddable(gList.get(gIndex));
                    node.replaceChild(newChild, child);
                    child = newChild; // continue iteration from the stub, now in child's place
                    continue;
                }
            }
            abstract_embeddable_stubifyDescendants(child, gList);
        }
    }

    /** Adjusts the left padding (leading space content) of an element.
      * A delta of zero leaves the element untouched.
      *
      *     @param element whose padding to adjust
      *     @param delta number of space characters to remove (negative)
      *         or add (positive)
      *     @param b string builder to use, overwriting its existing content
      */
    public static void adjustLeftPadding(final Element element, int delta, final StringBuilder b) {
        final Node firstChild = element.getFirstChild();
        if (firstChild == null) {
            return; // empty element, has no padding to adjust
        }
        if (delta == 0) {
            return; // nothing to add or remove
        }

        b.setLength(0);
        if (firstChild instanceof Text) {
            b.append(((Text) firstChild).getData());
        }

        if (delta < 0) { // remove some padding
            for (;;) {
                if (b.length() == 0 || b.charAt(0) != ' ') {
                    break; // No more space padding to remove. Only spaces supported; tabs are non-trivial (see genetic encoder for examples), but will often self-adjust, so leave them unadjusted for now.
                }

                b.deleteCharAt(0);
                ++delta;
                if (delta >= 0) {
                    break;
                }
            }
        } else { // add some padding
            if (b.length() != 0 && b.charAt(0) == '\n') {
                return; // no content on this line to bother about padding (odd case, since margin genes are usually leaves on a single line)
            }

            for (; delta > 0; --delta) {
                b.insert(0, ' ');
            }
        }

        if (firstChild instanceof Text) {
            ((Text) firstChild).setData(b.toString());
        } else {
            if (b.length() == 0) {
                return; // no padding was added
            }

            Text text = element.getOwnerDocument().createTextNode(b.toString());
            element.insertBefore(text, firstChild);
        }
    }

    /** Constructs an unattached gene meta-data ('g') element.
      *
      *     @param locus of the gene
      *     @param gg the meta-data 'gg' element,
      *         to which caller will attach new 'g' element
      *     @param b string builder to use, overwriting its existing content
      */
    public static Element createG(final String locus, final Element gg, final StringBuilder b) {
        final String qualifiedName =
            DOM.buildElementPrefix(gg, TEXTBENDER_NAMESPACE, b).append("g").toString();
        Element g = gg.getOwnerDocument().createElementNS(TEXTBENDER_NAMESPACE, qualifiedName);
        g.setAttributeNS(null, "locus", locus);
        return g;
    }

    /** Constructs an unattached 'g' list ('gg') element.
      *
      *     @param metaData the document's
      *         {@linkplain DocumentRT#findMetaData(Node) meta-data element},
      *         to which caller will attach new 'gg' element
      *     @param b string builder to use, overwriting its existing content
      *
      *     @see #ensureGG(Element,StringBuilder)
      */
    public static Element createGG(final Element metaData, final StringBuilder b) {
        final String qualifiedName =
            DOM.buildElementPrefix(metaData, TEXTBENDER_NAMESPACE, b).append("gg").toString();
        return metaData.getOwnerDocument().createElementNS(TEXTBENDER_NAMESPACE, qualifiedName);
    }

    /** Constructs an unattached stub element, in the null namespace,
      * carrying the 'locus' attribute of the given 'g' element.
      */
    private static Element createStub_embeddable(Element g) {
        final String locus = g.getAttributeNS(null, "locus");
        Element stub = g.getOwnerDocument().createElementNS(null, EMBEDDABLE_STUB_LOCAL_NAME);
        stub.setAttributeNS(null, "locus", locus);
        return stub;
    }

    /** Local name of stub elements that represent contained genes
      * in embeddable forms.  The namespace is always null.
      */
    public static final String EMBEDDABLE_STUB_LOCAL_NAME = "_";

    /** Returns a document's 'gg' element; if necessary creating it.
      * A newly created 'gg' element is appended to the meta-data element,
      * along with marker comments and whitespace text nodes.
      *
      *     @param metaData the document's
      *         {@linkplain DocumentRT#findMetaData(Node) meta-data element}
      *     @param b string builder to use, overwriting its existing content
      *
      *     @see #createGG(Element,StringBuilder)
      *     @see #findGG(Element)
      */
    public static Element ensureGG(final Element metaData, StringBuilder b) {
        Element gg = findGG(metaData);
        if (gg == null) {
            gg = createGG(metaData, b);
            final Document document = metaData.getOwnerDocument();
            gg.appendChild(document.createTextNode("\n "));
            gg.appendChild(document.createComment(" <<< "));
            gg.appendChild(document.createTextNode("\n"));
            metaData.appendChild(gg);
            metaData.appendChild(document.createTextNode("\n"));
            metaData.appendChild(document.createComment(" >>> "));
            metaData.appendChild(document.createTextNode("\n "));
        }
        return gg;
    }

    /** Transforms an embeddable sequence from unescaped to escaped form.
      * Each special character is replaced by a backslash followed by a substitute character.
      *
      *     @param b string builder containing the unescaped sequence
      *
      *     @see #unescape_embeddable(StringBuilder)
      *     @see "d/gene/note.xht#Escaping"
      */
    public static void escape_embeddable(final StringBuilder b) {
        // Iterating backward, so that insertions do not shift the characters yet to be visited.
        for (int c = b.length() - 1; c >= 0; --c) {
            final char ch2;
            switch (b.charAt(c)) {
                case '\\': ch2 = '\\'; break;
                case '\n': ch2 = 'n';  break;
                case '\r': ch2 = 'r';  break;
                case '<':  ch2 = '(';  break;
                case '>':  ch2 = ')';  break;
                case '\'': ch2 = '`';  break;
                case '"':  ch2 = '~';  break;
                default: continue;
            }

            b.setCharAt(c, ch2);
            b.insert(c, '\\');
        }
    }

    /** Finds a document's 'gg' element.
      *
      *     @param metaData the document's
      *         {@linkplain DocumentRT#findMetaData(Node) meta-data element}
      *     @return 'gg' element, or null if none found
      *
      *     @see #ensureGG(Element,StringBuilder)
      */
    public static Element findGG(final Element metaData) {
        for (Node child = metaData.getFirstChild(); child != null; child = child.getNextSibling()) {
            if (!(child instanceof Element)) {
                continue;
            }

            if ("gg".equals(child.getLocalName())
                    && TEXTBENDER_NAMESPACE.equals(child.getNamespaceURI())) {
                return (Element) child;
            }
        }
        return null;
    }

    /** G-index flag indicating that a node has no 'g' attribute.
      *
      *     @see #gIndexOf(Element)
      */
    public static final int G_INDEX_FLAG_NONE = -1;

    /** G-index flag indicating an unrecognized 'g' attribute value.
      *
      *     @see #gIndexOf(String)
      */
    public static final int G_INDEX_FLAG_UNKNOWN = -2;

    /** G-index flag indicating a 'g' attribute value
      * that begins with a single '-'.
      *
      *     @see #gIndexOf(String)
      *     @see #gIndexIsNonGenetic(int)
      */
    public static final int G_INDEX_FLAG_SINGLE_DASH = -3;

    /** G-index flag indicating a 'g' attribute value that begins '--'.
      *
      *     @see #gIndexOf(String)
      *     @see #gIndexIsNonGenetic(int)
      */
    public static final int G_INDEX_FLAG_DOUBLE_DASH = -4;

    /** Returns true iff {@linkplain #gIndexOf(Element) gIndexOf}
      * indicates a non-genetic element; either
      * {@linkplain #G_INDEX_FLAG_SINGLE_DASH G_INDEX_FLAG_SINGLE_DASH} or
      * {@linkplain #G_INDEX_FLAG_DOUBLE_DASH G_INDEX_FLAG_DOUBLE_DASH}.
      */
    public static boolean gIndexIsNonGenetic(final int gIndex) {
        return gIndex == G_INDEX_FLAG_SINGLE_DASH || gIndex == G_INDEX_FLAG_DOUBLE_DASH;
    }

    /** Returns true iff {@linkplain #gIndexOf(Element) gIndexOf} {@code >= 0},
      * indicating a gene.
      */
    public static boolean gIndexIsGene(final int gIndex) {
        return gIndex >= 0;
    }

    /** Returns the g-index of an element, derived from its 'g' attribute value.
      *
      *     @return g-index or flag per {@linkplain #gIndexOf(String) gIndexOf}(String);
      *         or {@linkplain #G_INDEX_FLAG_NONE G_INDEX_FLAG_NONE}
      *         if the element has no 'g' attribute
      *
      *     @see #gIndexOfNode(Node)
      */
    public static int gIndexOf(final Element element) {
        final String g = element.getAttributeNS(TEXTBENDER_NAMESPACE, "g");
        if (g.length() == 0) {
            return G_INDEX_FLAG_NONE;
        }

        return gIndexOf(g);
    }

    /** Returns the g-index of a node.
      *
      *     @return g-index or flag per {@linkplain #gIndexOf(String) gIndexOf}(String);
      *         or {@linkplain #G_INDEX_FLAG_NONE G_INDEX_FLAG_NONE}
      *         if the node is not an element, or has no 'g' attribute
      */
    public static int gIndexOfNode(final Node node) {
        if (node instanceof Element) {
            return gIndexOf((Element) node);
        }

        return G_INDEX_FLAG_NONE;
    }

    /** Returns the g-index of a 'g' attribute value.
      * The value of a gene is parsed as a base-36 integer.
      *
      *     @param g the attribute value
      *     @return g-index ({@code >= 0}) if the 'g' attribute designates a gene;
      *         else a flag ({@code < 0}):
      *         {@linkplain #G_INDEX_FLAG_UNKNOWN G_INDEX_FLAG_UNKNOWN}
      *         (including for a value that is empty, or that cannot be parsed
      *         as a base-36 integer),
      *         {@linkplain #G_INDEX_FLAG_SINGLE_DASH G_INDEX_FLAG_SINGLE_DASH}, or
      *         {@linkplain #G_INDEX_FLAG_DOUBLE_DASH G_INDEX_FLAG_DOUBLE_DASH}
      *
      *     @see #gIndexOf(Element)
      *     @see #gOf(int)
      */
    public static int gIndexOf(final String g) {
        if (g.length() == 0) {
            return G_INDEX_FLAG_UNKNOWN; // nothing to recognize
        }

        final char c = g.charAt(0);
        if (c == '-') {
            if (g.length() > 1 && g.charAt(1) == '-') {
                return G_INDEX_FLAG_DOUBLE_DASH;
            }
            return G_INDEX_FLAG_SINGLE_DASH;
        }

        final boolean isBase36Digit =
            c >= '0' && c <= '9' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z';
        if (!isBase36Digit) {
            return G_INDEX_FLAG_UNKNOWN;
        }

        try {
            return Integer.parseInt(g, /*radix*/36);
        } catch (NumberFormatException x) {
            // Only the first character was vetted; a later character is invalid,
            // or the value overflows an int.  Either way the value is unrecognized.
            return G_INDEX_FLAG_UNKNOWN;
        }
    }

    /** Returns the g-index encoded as a string
      * suitable for a 'g' attribute value.
      *
      *     @param gIndex the g-index, which must not be negative
      *     @throws IndexOutOfBoundsException if gIndex is negative
      *
      *     @see #gIndexOf(String)
      */
    public static String gOf(final int gIndex) {
        if (gIndex < 0) {
            throw new IndexOutOfBoundsException();
        }

        String g = Integer.toString(gIndex, /*radix*/36);
     // if( g.length() == 1 ) g = "0" + g; // zero-padded to at least two characters, a temporary fudge to reduce the likelihood of minor misalignments in leaf genes generated by the genetic encoder
        return g;
    }

    /** Returns the effective 'gR' attribute of an element.
      *
      *     @param element e.g. a meta-data 'gg' or ancestor 'a' element
      *
      *     @return 'gR' attribute value if present, else "0"
      */
    public static String gROf(final Element element) {
        final String gR = element.getAttributeNS(null, "gR");
        return gR.length() == 0 ? "0" : gR;
    }

    /** Returns true iff the node has a child gene.
      * Where the node is itself a gene,
      * true indicates a parent gene; false a leaf.
      */
    public static boolean hasChildGene(final Node node) {
        for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
            if (child instanceof Element && isGene((Element) child)) {
                return true;
            }
        }
        return false;
    }

    /** Returns true if the element is a gene.
      * An element is a gene if it has a g-index.
      *
      *     @see #gIndexOf(Element)
      */
    public static boolean isGene(final Element element) {
        return gIndexIsGene(gIndexOf(element));
    }

    /** Links a gene meta-data ('g') element into the document.
      * Linkage entails 1) appending the 'g' element to the 'gg' list;
      * and 2) setting the 'g' attribute of the sequenceElement to point to it.
      *
      *     @param g gene meta-data element
      *     @param sequenceElement element that will become the gene;
      *         it should itself be attached in the document,
      *         or the namespace prefix lookup may fail
      *     @param gg the meta-data 'gg' element
      *     @param gIndex prior count of 'g' elements,
      *         and offset for the newly created 'g' element, in 'gg' list
      *     @param b string builder to use, overwriting its existing content
      *
      *     @return g-index offset of the newly created 'g' element in 'gg' list
      *         (same as passed in)
      *
      *     @see #createG(String,Element,StringBuilder)
      */
    public static int linkG(final Element g, final Element sequenceElement, final Element gg,
            final int gIndex, final StringBuilder b) {
        final Document document = gg.getOwnerDocument();
        final String gString = gOf(gIndex);

        // echo g-index in comments every so often, as debugging aid
        if (gString.charAt(gString.length() - 1) == '0') {
            gg.appendChild(document.createTextNode("\n"));
            b.setLength(0);
            b.append(' ');
            for (int padCount = 3 - gString.length(); padCount > 0; --padCount) {
                b.append(' '); // right-align the index in a field of three characters
            }
            b.append(gString);
            b.append(' ');
            gg.appendChild(document.createComment(b.toString()));
            gg.appendChild(document.createTextNode("\n"));
        }

        gg.appendChild(g);
        gg.appendChild(document.createTextNode("\n"));

        final String qualifiedName =
            DOM.buildAttributePrefix(sequenceElement, TEXTBENDER_NAMESPACE, b)
              .append('g').toString();
        sequenceElement.setAttributeNS(TEXTBENDER_NAMESPACE, qualifiedName, gString);
        return gIndex;
    }

    /** Returns the node if it is a gene;
      * otherwise the nearest ancestor that is a gene.
      *
      *     @param node reference node, which may be null
      *     @return node, or its nearest ancestor gene, or null
      */
    public static Element selfOrAncestorAsGene(Node node) {
        for (; node != null; node = node.getParentNode()) {
            if (node instanceof Element && isGene((Element) node)) {
                return (Element) node;
            }
        }
        return null;
    }

    /** Transforms an embeddable sequence from escaped to unescaped form.
      *
      *     @param b string builder containing the escaped sequence
      *
      *     @see #escape_embeddable(StringBuilder)
      *     @see "d/gene/note.xht#Escaping"
      */
    public static void unescape_embeddable(final StringBuilder b) {
        for (int c2 = 1; c2 < b.length(); ++c2) {
            final int c1 = c2 - 1;
            if (b.charAt(c1) != '\\') {
                continue;
            }

            final char ch2 = b.charAt(c2);
            final char ch;
            switch (ch2) {
                case '\\': ch = '\\'; break;
                case 'n':  ch = '\n'; break;
                case 'r':  ch = '\r'; break;
                case '(':  ch = '<';  break;
                case ')':  ch = '>';  break;
                case '`':  ch = '\''; break;
                case '~':  ch = '"';  break;
                default:
                    assert false; // unknown escape sequence
                    ch = ch2;
            }

            if (ch != ch2) {
                b.setCharAt(c2, ch);
            }
            // Deleting the backslash shifts the unescaped character down to c1, so the
            // next iteration skips over it; an unescaped backslash is never re-examined.
            b.deleteCharAt(c1);
        }
    }

}