package textbender.a.u.encoding; // Copyright 2007, Michael Allan. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Textbender Software"), to deal in the Textbender Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicence, and/or sell copies of the Textbender Software, and to permit persons to whom the Textbender Software is furnished to do so, subject to the following conditions: The preceding copyright notice and this permission notice shall be included in all copies or substantial portions of the Textbender Software. THE TEXTBENDER SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE TEXTBENDER SOFTWARE OR THE USE OR OTHER DEALINGS IN THE TEXTBENDER SOFTWARE. import java.io.*; import java.util.*; import javax.xml.transform.*; import org.w3c.dom.*; import org.w3c.dom.ls.*; import textbender._.*; import textbender.a.u.transfer.clipboard.*; import textbender.d.gene.*; import textbender.d.gene.xhtml.*; import textbender.g.util.*; import textbender.g.xml.dom.*; import textbender.g.xml.dom.bootstrap.*; import static textbender._.Textbender.TEXTBENDER_NAMESPACE; import static textbender.d.gene.Gene.G_INDEX_FLAG_DOUBLE_DASH; import static textbender.d.gene.Gene.G_INDEX_FLAG_NONE; import static textbender.o.xhtml.XHTML.XHTML_NAMESPACE; /** A genetic encoder. */ public final class Encoder { /** Constructs an Encoder for a document. * * @param gg the 'g' list ('gg') element of the document * @param gList live list of gene meta-data ('g') elements in 'gg' * (e.g. backed by a NodeList) * @param clipGeneReplacer to use * @param stringBuilder string builder to use, overwriting its existing content * @param uuidStringifier UUID stringifier to use */ Encoder( Element gg, List gList, Replacer clipGeneReplacer, StringBuilder stringBuilder, UUIDStringifier uuidStringifier ) { this.gg = gg; this.gList = gList; this.clipGeneReplacer = clipGeneReplacer; _stringBuilder = stringBuilder; this.uuidStringifier = uuidStringifier; document = gg.getOwnerDocument(); } // ------------------------------------------------------------------------------------ /** Returns a Recombinant XHTML document read from an input stream, * and then {@linkplain #encodeRecursively encoded}. Convenience method. * * @param in stream to read document from * @param transformerFactory to use * @param errorHandler to use * * @return document, suitable for passing to * RecombinantXHTML.{@linkplain textbender.d.gene.xhtml.RecombinantXHTML#write(Document,TransformerFactory,Result) write} * * @throws IOException from Replacer * to the provided errorHandler * @throws TextbenderException if document not in recombinant form * @throws TransformerException from replaceRecursively, new Replacer */ public static Document encoded( final InputStream in, final TransformerFactory transformerFactory, final DOMErrorHandler errorHandler ) throws IOException, LSException, TextbenderException, TransformerException { // Read from input. // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Document document = null; // till read final DOMImplementationLS domLS = DOMImplementationRegistryX.implementationLS(); final LSParser parser = domLS.createLSParser ( domLS.MODE_SYNCHRONOUS, "http://www.w3.org/TR/REC-xml" ); { final DOMConfiguration domConfig = parser.getDomConfig(); domConfig.setParameter( "error-handler", errorHandler ); // domConfig.setParameter( "validate", true ); domConfig.setParameter ( "resource-resolver", // new textbender.d.gene.xhtml.RecombinantXHTML.DOMResourceResolver( domLS ) new textbender.d.gene.xhtml.RecombinantXHTML.DOMResourceResolverMin( domLS ) ); final LSInput lsInput = domLS.createLSInput(); lsInput.setByteStream( in ); document = parser.parse( lsInput ); } // Encode. // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - { // final Element metaData = DocumentRT.findMetaData( document ); // if( metaData == null ) throw new TextbenderException( DocumentRT.FIND_META_DATA_FAILURE_MESSAGE ); // final StringBuilder b = new StringBuilder(); final Element metaData = RecombinantXHTML.ensureMetaData( document, b ); final Element gg = Gene.ensureGG( metaData, b ); final List gList = new ElementListNL ( gg.getElementsByTagNameNS( TEXTBENDER_NAMESPACE, "g" )); final UUIDStringifier uuidStringifier = new UUIDStringifier(); Replacer clipGeneReplacer = new Replacer ( gg, gList, b, domLS, RecombinantXHTML.i(), parser, transformerFactory, uuidStringifier ); Encoder encoder = new Encoder ( gg, gList, clipGeneReplacer, b, uuidStringifier ); encoder.encodeRecursively( document ); } return document; } /** Encodes the node and its descendants. */ public void encodeRecursively( final Node node ) throws TransformerException // TransformerException replaceRecursively { clipGeneReplacer.replaceRecursively( node ); // Clone meta-data of partially cloned genes. // Done early (before writing any meta-data) to preclude collisions. // // OPT. May create unecessary meta-data in cases where a cloned gene // is temporary, to be deleted (e.g. when spliced into another gene) // during subsequent encoding. In future, we could detect collisions // on the fly by tracking which gene has written // to each meta-data 'g' record. Meta-data could then be cloned on the fly, // only where necessary. And a final call to cloneMetaRecursively() // would then be made to catch all the rest; // no temporary genes would exist at that time. // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - cloneMetaRecursively ( node, new ArrayList( /*initial capacity*/gList.size() + 10/*cloned genes, guess*/ ) ); // Encode. // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - encodeRecursively_1_marginal( node ); encodeRecursively_2_ordinary( node ); } /** Answers whether the element is abutted with the left margin. */ public static boolean isMarginal( final Element element ) { final Node sib = element.getPreviousSibling(); if( sib == null ) return !( element.getParentNode() instanceof Element ); // with no previous sibling, it is marginal only if it is the first element in document order (has no parent) if( !( sib instanceof Text )) return false; final String textData = ((Text)sib).getData(); return textData.charAt( textData.length() - 1 ) == '\n'; } //// P r i v a t e /////////////////////////////////////////////////////////////////////// private final Replacer clipGeneReplacer; private int cloneMeta( final Element gene, int gIndex ) { Element g = (Element)gList.get( gIndex ).cloneNode( /*deeply*/true ); gIndex = gList.size(); Gene.linkG( g, gene, gg, gIndex, _stringBuilder ); return gIndex; } /** Clones the meta-data of partially cloned genes. * A partially cloned gene is one whose element was duplicated manually * by the user, resulting in two elements that share the same g-index, * and hence the same meta-data. This would cause problems * if it were not corrected. To correct it, * this method clones the meta-data for each partially cloned gene, * and assigns a new, unique g-index. * * @param node to process, deeply * @param geneList sparse list of genes (indexed by g-index) * processed thus far */ private void cloneMetaRecursively( final Node node, final ArrayList geneList ) { if( node instanceof Element ) { final Element element = (Element)node; int gIndex = Gene.gIndexOf( element ); if( gIndex >= 0 ) { if( geneList.size() > gIndex && geneList.get(gIndex) != null ) { gIndex = cloneMeta( element, gIndex ); } geneList.ensureCapacity( gIndex + 1 ); while( geneList.size() <= gIndex ) geneList.add( null ); // ensure size gIndex+1 geneList.set( gIndex, element ); } } for( Node child = node.getFirstChild(); child != null; child = child.getNextSibling() ) { cloneMetaRecursively( child, geneList ); } } private static final String DEFAULT_LINE_LEAF_ELEMENT_NAME = "div"; private static final String DEFAULT_LINE_LEAF_ELEMENT_NAMESPACE = XHTML_NAMESPACE; private static final String DEFAULT_LINE_LEAF_RIGHT_PADDING = " "; private static final int DEFAULT_TAB_LENGTH = 8; private final Document document; /** If the element is a marginal leaf gene, * gives it a pass of genetic encoding as such. * Only called from context of encodeRecursively, * q.v. for assumed preprocessing. * * @return true iff element is a gene in marginal leaf form */ private boolean encode_isMarginalLeaf( final Element element, final int gIndex ) { return false; // till implemented } /** If the sequence is in marginal leaf form, * gives it a pass of genetic encoding as such. * Only called from context of encode_isMarginalLeafParent, * q.v. for assumed preprocessing. * * @param _bound0 start bound (exclusive) of sequence * @param _bound1 end bound (exclusive) of sequence * * @return true iff sequence is in marginal leaf form */ private boolean encode_isMarginalLeafBetween( DOMPointer _bound0, DOMPointer _bound1 ) { final DOMPointer p0 = new DOMPointer().setFrom( _bound0 ); final DOMPointer p1 = new DOMPointer().setFrom( _bound1 ); if( p0.equals( p1 )) { assert false: "empty sequence"; return false; } p0.nextTextOffsetOrNode(); // p0 is now an inclusive limit // Exclude anything insignificant at sequence end. // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - for( ;; ) { if( p1.equals( p0 )) return false; // empty sequence p1.previousTextOffsetOrNode(); // p1 is now an inclusive limit, like p0 if( isSequenceEdge( p1 )) break; } // Is sequence a marginal leaf? // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - for( Node node = p0.node, end = null; end == null; node = node.getNextSibling() ) { if( node.equals( p1.node )) end = node; if( node instanceof Text ) continue; // assume already checked for newlines by caller // Reject if it contains a gene. Leaves contain no genes. // ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` if( node instanceof Element && Gene.isGene( (Element)node )) return false; // Reject if it contains a newline; margin-leaves are single lines. // // --- --- ok // -- no // -- --- no // // --- ok // --- no // ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` if( hasNewline( node )) return false; } // If the sequence is a single element, try to convert it to a gene. // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - assert isSequenceEdge( p1 ): "p1 previously moved to right edge"; if( p1.node instanceof Element ) { DOMPointer p = new DOMPointer().setFrom( p0 ); while( !isSequenceEdge( p )) p.nextTextOffsetOrNode(); // find the left edge if( p.node.equals( p1.node )) { if( Gene.gIndexIsNonGenetic( Gene.gIndexOf( (Element)p.node ))) return false; Gene.linkG ( Gene.createG ( /*locus*/uuidStringifier.toBase64Name(UUID.randomUUID()), gg, _stringBuilder ), (Element)p.node, gg, /*gIndex*/gList.size(), _stringBuilder ); return true; } } // Otherwise, encapsulate sequence in a new gene element. // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // cut sequence text if necessary, to align on whole-node boundaries // ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` if( p0.node instanceof Text && p0.offset > 0 ) { final Text trailingText = ((Text)p0.node).splitText( p0.offset ); // p0.node is leading text if( p1.node.equals( p0.node )) // correct for split { p1.node = trailingText; p1.offset -= p0.offset; } p0.node = trailingText; p0.offset = 0; // though never read } if( p1.node instanceof Text && p1.offset < ((Text)p1.node).getLength() - 1 ) { ((Text)p1.node).splitText( p1.offset + 1 ); p1.offset = ((Text)p1.node).getLength() - 1; // though never read } // insert new gene element // ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` final Node parent = p0.node.getParentNode(); final Element gene = document.createElementNS ( DEFAULT_LINE_LEAF_ELEMENT_NAMESPACE, DOM.buildElementPrefix( parent, DEFAULT_LINE_LEAF_ELEMENT_NAMESPACE, _stringBuilder ) .append( DEFAULT_LINE_LEAF_ELEMENT_NAME ).toString() ); parent.insertBefore( gene, p0.node ); Gene.linkG ( Gene.createG ( /*locus*/uuidStringifier.toBase64Name(UUID.randomUUID()), gg, _stringBuilder ), gene, gg, /*gIndex*/gList.size(), _stringBuilder ); // move sequence into gene element // ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` for( Node node;; ) { node = gene.getNextSibling(); // if( node == null ) { assert false: "leapt p1"; break; } // caught by NullPointer // gene.appendChild( parent.removeChild( node )); gene.appendChild( node ); if( node.equals( p1.node )) break; } // delete leading whitespace, so gene start tag effectively overwrites it, // leaving sequence text at its original position, as far as possible // //
// // // ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` if( p0.node instanceof Text ) { final int tagLength; { Attr gAttribute = gene.getAttributeNodeNS( TEXTBENDER_NAMESPACE, "g" ); tagLength = 1 // < + gene.getTagName().length() + 1 + gAttribute.getName().length() + 2 // =" + gAttribute.getValue().length() + 2; // "> } final Text text = (Text)p0.node; final String wholeTextData = text.getWholeText(); int nToDelete = 0; int tagLengthRemaining = tagLength; // so far for( ;; ) { if( nToDelete >= wholeTextData.length() ) break; final char ch = wholeTextData.charAt( nToDelete ); if( ch == ' ' ) tagLengthRemaining -= 1; else if( ch == '\t' ) { final int column = tagLength - tagLengthRemaining; // characters written final int tabThreshold = DEFAULT_TAB_LENGTH - column % DEFAULT_TAB_LENGTH; // characters it will take to spring it forward if( tagLengthRemaining < tabThreshold ) break; // no need to delete tab, it will absorb all remaining characters tagLengthRemaining -= DEFAULT_TAB_LENGTH; } else break; ++nToDelete; if( tagLengthRemaining <= 0 ) break; } if( nToDelete > 0 ) text.replaceWholeText( wholeTextData.substring( nToDelete )); } // pad the sequence's right-edge, from the end tag // ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` gene.appendChild( document.createTextNode( DEFAULT_LINE_LEAF_RIGHT_PADDING )); return true; } /** If the element is in marginal-leaf parent form, * gives it a pass of genetic encoding as such. * Only called from context of encodeRecursively, * q.v. for assumed preprocessing. * * @return true iff element is in marginal-leaf parent form * (which does not necessarily mean it is a parent, * or anything else, except the form is right) */ private boolean encode_isMarginalLeafParent( final Element element, int gIndex ) { boolean isMarginalLeafParent = false; // till proven otherwise // A marginal-leaf parent's start tag must trail the line, // with no subsequent characters other than spaces or tabs, // comments, or processing instructions. // // // --- // // // // The end tag must be on another line. // // // // // Newlines in trailing comments and processing instructions are ignored. // // // -- // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - DOMPointer n0 = new DOMPointer().set( element.getFirstChild(), 0 ); for(;; n0.nextTextOffsetOrNode() ) { if( n0.node == null ) return isMarginalLeafParent; if( n0.node instanceof Text && ((Text)n0.node).getData().charAt(n0.offset) == '\n' ) break; if( isSequenceEdge( n0 )) return isMarginalLeafParent; } isMarginalLeafParent = true; // System.err.print( ", is parental" ); // PRIVATE // Parent. // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if( !Gene.gIndexIsGene(gIndex) && !Gene.gIndexIsNonGenetic(gIndex) ) Gene.linkG ( Gene.createG ( /*locus*/uuidStringifier.toBase64Name(UUID.randomUUID()), gg, _stringBuilder ), element, gg, /*gIndex*/gList.size(), _stringBuilder ); // gIndex = Gene.createG_link // ( // element, /*locus*/uuidStringifier.toBase64Name(UUID.randomUUID()), // gg, /*gIndex*/gList.size(), _stringBuilder // ); // } else if( gIndex == G_INDEX_FLAG_DOUBLE_DASH ) return isMarginalLeafParent; // entire subtree non-genetic // Child lines. // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // // --- ok -- // -- ok --- // // // Only those ending in a newline are considered. // // // --- ok -- // -- no --- DOMPointer n1 = new DOMPointer(); // will point n0,n1 to newlines at edge of each child n1.lastTextOffset( element ); // work in reverse, because text will be cut (in encode_isMarginalLeafBetween), clobbering all offsets after the cut for( ;; ) // find last newline { if( ((Text)n1.node).getData().charAt(n1.offset) == '\n' ) break; n1.previousTextOffset(); if( n1.node == null ) { assert false; return isMarginalLeafParent; }; } lines: for( ;; ) { // n0 to previous newline before n1 // ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` n0.setFrom( n1 ); for( ;; ) { n0.previousTextOffset(); if( n0.node == null ) break lines; if( ((Text)n0.node).getData().charAt(n0.offset) == '\n' ) break; } // create leaf gene in this line (maybe) // ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` encode_isMarginalLeafBetween( n0, n1 ); // swap n0 and n1 // ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` ` DOMPointer spare = n0; n0 = n1; n1 = spare; } return isMarginalLeafParent; } /** Encoding pass 1, dealing with marginal encodings. */ private void encodeRecursively_1_marginal( final Node node ) throws TransformerException { final int gIndex; final Element element; if( node instanceof Element ) { element = (Element)node; gIndex = Gene.gIndexOf( element ); } else { element = null; gIndex = G_INDEX_FLAG_NONE; } // Node subtree (depth) first. // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if( gIndex != G_INDEX_FLAG_DOUBLE_DASH ) // subtree is not marked non-genetic { for( Node child = node.getFirstChild(); child != null; child = child.getNextSibling() ) { encodeRecursively( child ); } } // Node itself. // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - if( element == null ) return; // non-element // System.err.println( "LC, e=" + element ); // PRIVATE leaderTest: // FIX, this will probably move into encode_isMarginalLeafParent // Element must be a line-leader. Its start tag // must lead the line, with no prior character in the encoding, // other than space or tab. // // // --- // --- // -- // // // An exception is made for the document element. // It is always considered a line-leader, regardless. { if( element.equals( document.getDocumentElement() )) break leaderTest; // assume document element leads line, since this cannot be detected, because DOM does not model its previous text siblings (JDK 1.6) final Node previous = element.getPreviousSibling(); if( !( previous instanceof Text )) return; final DOMPointer p = new DOMPointer().set ( previous, ((Text)previous).getLength() - 1 ); for( ;; ) { final char ch = ((Text)p.node).getData().charAt( p.offset ); if( ch == '\n' ) break; if( !( ch == ' ' || ch == '\t' )) return; p.previousTextOffsetOrNode(); if( !( p.node instanceof Text )) return; } } // System.err.print( " leads line" ); // PRIVATE if( !encode_isMarginalLeaf( element, gIndex )) // always false, till implemented { encode_isMarginalLeafParent( element, gIndex ); } // System.err.println(); // PRIVATE } /** Encoding pass 2, dealing with ordinary genes. */ private void encodeRecursively_2_ordinary( final Node node ) {} // not yet implemented private final Element gg; private final List gList; /** Returns true if a newline character exists in the node, or in its descendants. */ private static boolean hasNewline( final Node node ) { if( node instanceof CharacterData ) return hasNewline( ((CharacterData)node).getData() ); if( node instanceof ProcessingInstruction ) return hasNewline( ((ProcessingInstruction)node).getData() ); for( Node child = node.getFirstChild(); child != null; child = child.getNextSibling() ) { if( hasNewline( child )) return true; } return false; } /** Returns true if a newline character exists in the string. */ private static boolean hasNewline( String string ) { return string.indexOf('\n') >= 0; } /** Tests whether p points to content considered significant * at the leading or trailing edge of a sequence candidate. * * @return true unless p points to a comment; a processing instruction; * or a space or tab character of a Text node */ private static boolean isSequenceEdge( DOMPointer p ) { final boolean isSequenceEdge; if( p.node instanceof Text ) { char ch = ((Text)p.node).getData().charAt(p.offset); isSequenceEdge = !( ch == ' ' || ch == '\t' ); } else isSequenceEdge = !( p.node instanceof Comment || p.node instanceof ProcessingInstruction ); return isSequenceEdge; } /** Common string builder. */ private final StringBuilder _stringBuilder; /** Empties and returns the common string builder. */ private StringBuilder _stringBuilderEmpty() { _stringBuilder.delete( 0, Integer.MAX_VALUE ); return _stringBuilder; } private final UUIDStringifier uuidStringifier; }