package votorola.a; // Copyright 2010-2013, Michael Allan.  Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Votorola Software"), to deal in the Votorola Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicence, and/or sell copies of the Votorola Software, and to permit persons to whom the Votorola Software is furnished to do so, subject to the following conditions: The preceding copyright notice and this permission notice shall be included in all copies or substantial portions of the Votorola Software. THE VOTOROLA SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE VOTOROLA SOFTWARE OR THE USE OR OTHER DEALINGS IN THE VOTOROLA SOFTWARE.

import com.hp.hpl.jena.query.*;
import com.hp.hpl.jena.rdf.model.*;
import java.io.*;
import java.net.*;
import java.text.*;
import java.util.*;
import java.util.regex.*;
import javax.ws.rs.core.UriBuilder;
import javax.xml.stream.*;
import votorola.a.*;
import votorola.g.*;
import votorola.g.hold.*;
import votorola.g.io.*;
import votorola.g.lang.*;
import votorola.g.logging.*;
import votorola.g.net.*;
import votorola.g.text.*;


/** A directory <code>~/votorola/in/wiki</code> of "semantic" data cached from the
  * pollwiki.  Cache files are created with broad permissions and may be overwritten by
  * any runtime owner (vote-server account, servlet container and others).  The
  * administrator may safely delete the contained files at runtime without causing
  * unrecoverable errors, but should not delete the directory itself until after
  * shutting down all runtime processes.
  *
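  * <p>The cache is laid out roughly as follows (a sketch only; the page names shown
  * here are hypothetical):</p>
  *
  * <pre>
  *     ~/votorola/in/wiki/
  *         lastChurn.serial        record of the last churn
  *         Main/Some_page.json     cached RDF of page "Some_page"
  *         User/Example.json       cached RDF of page "User:Example"
  * </pre>
  *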
  *     @see <a href='../../../../s/manual.xht#wiki'
  *                           >../s/manual.xht#wiki</a>
  */
public @ThreadSafe final class WikiCache extends File
{


    /** Constructs a WikiCache.
      */
    WikiCache( VoteServer _voteServer ) throws IOException
    {
        super( _voteServer.inDirectory(), "wiki" );
        voteServer = _voteServer;

        if( !exists() ) init_create();
    }



    private final Churn init_create() throws IOException
    {
        if( !mkdir() ) throw new IOException( "unable to create directory: " + WikiCache.this );

        setWritable( true, /*ownerOnly*/false );
        return churn( /*fromScratch*/true );
    }



   // ------------------------------------------------------------------------------------


    /** Replaces any stale pages in the cache with fresh copies from the wiki.  This
      * should be called periodically, e.g. via one of the administrative commands (such
      * as votrace) that has a <code>--churn</code> option.  However, it should not be
      * called too often, as it has the side effect of clearing the poll cache.  You may
      * instead want to query the pollwiki directly; Semantic MediaWiki 1.7 introduces a
      * MediaWiki API extension that might be more convenient than the RDF interface.
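      *
      * <p>For example, a periodic churn might be requested from the vote-server account
      * like this (a sketch only):</p>
      *
      * <pre>
      *     votrace --churn</pre>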
      *
      * <p>Note: only changes to page content are detected.  A property change owing to a
      * change in the content of a template will not be detected.  This is a BUG and the
      * current workaround is to manually delete the page's file in the cache.</p>
      *
      *     @return this wiki cache.
      *
      *     @see #lastChurnTime()
      *     @see votorola.a.count.PollService.VoteServerScope.Run#ensurePoll(String)
      */
    public WikiCache churn() throws IOException
    {
        churn( /*fromScratch*/false );
        return WikiCache.this;
    }



    /** Returns the time of the last churn based on the timestamp of the churn's serial
      * file.
      *
      *     @return time in milliseconds since the Epoch, or 0L if unknown.
      */
    public long lastChurnTime() { return churnSerialFile.lastModified(); }



    /** Opens an input stream for the RDF of the specified wiki page in JSON format, UTF-8
      * character encoding.  Attempts first to use the local cache, falling back to the
      * wiki if necessary and caching the result for future calls.
      *
      *     @param fullPageName the full name of the page, including any namespace.
      */
    public FileInputStream openRDF_JSON( final String fullPageName ) throws IOException
    {
        return openRDF_JSON( fullPageName, /*toChurn*/false );
    }



    /** Opens an input stream for the RDF of the specified wiki page in JSON format, UTF-8
      * character encoding.  Attempts first to use the local cache, falling back to the
      * wiki if necessary and caching the result for future calls.
      *
      *     @param fullPageName the full name of the page, including any namespace.
      *     @param toChurn whether or not to bypass the cache and fetch the RDF straight
      *       from the wiki.  If true, any previously cached file will be ignored and (if
      *       the fetch succeeds) overwritten.
      */
    public FileInputStream openRDF_JSON( final String fullPageName, final boolean toChurn )
      throws IOException
    {
        final long msChurnBefore = toChurn? Long.MAX_VALUE: Long.MIN_VALUE;
        retry: for( int retryCount = 0;; ++retryCount )
        {
            final File cacheFile = ensure( fullPageName, msChurnBefore, /*allowNewFile*/true );
            try { return  new FileInputStream( cacheFile ); }
            catch( final FileNotFoundException x )
            {
                if( cacheFile.exists() || retryCount > 0 ) throw x;
                // Else retry.  The cacheFile was there earlier (ensured above), so it was
                // probably deleted by the administrator.  A single retry should suffice.
            }
        }
    }



    /** Fetches the RDF of the specified wiki page in JSON format.  Attempts first to use
      * the local cache, falling back to the wiki if necessary and caching the result for
      * future calls.
      *
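      * <p>A typical call looks like this (the variable and page name are hypothetical):</p>
      *
      * <pre>
      *     final String json = wikiCache.readRDF_JSON( "User:Example" );</pre>
      *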
      *     @param fullPageName the full name of the page, including any namespace.
      */
    public String readRDF_JSON( final String fullPageName ) throws IOException
    {
        return readRDF_JSON( fullPageName, /*toChurn*/false );
    }



    /** Fetches the RDF of the specified wiki page in JSON format.  Attempts first to use
      * the local cache, falling back to the wiki if necessary and caching the result for
      * future calls.
      *
      *     @param fullPageName the full name of the page, including any namespace.
      *     @param toChurn whether or not to bypass the cache and fetch the RDF straight
      *       from the wiki.  If true, any previously cached file will be ignored and (if
      *       the fetch succeeds) overwritten.
      */
    public String readRDF_JSON( final String fullPageName, final boolean toChurn )
      throws IOException
    {
        final BufferedReader in = new BufferedReader( new InputStreamReader(
          openRDF_JSON(fullPageName,toChurn), "UTF-8" ));
        try { return ReaderX.appendTo( new StringBuilder(), in ).toString(); }
        finally{ in.close(); }
    }



//// P r i v a t e ///////////////////////////////////////////////////////////////////////


    private final Churn churn( final boolean fromScratch ) throws IOException
    {
        final String rclimit;
        final String rcendQueryComponent; // specifying the earliest to churn
        if( fromScratch )
        {
            rclimit = "1"; // just enough to set tsLatestChurned (below)
            rcendQueryComponent = ""; // no need, fetching just one
        }
        else
        {
            rclimit = "500"; // per query, max for ordinary user
            Churn lastChurn = Churn.readObject( WikiCache.this );
            if( lastChurn == null )
            {
                LoggerX.i(getClass()).config( "lost churn history, cleaning out entire cache" );
                if( !FileX.deleteRecursive( WikiCache.this )) throw new IOException( "unable to delete directory (please delete it manually): " + WikiCache.this );

                return init_create();
            }

            rcendQueryComponent = "&rcend=" + lastChurn.tsLatestChurned; // oldest, listed new to old
              // Churning at the last timestamp of the previous churn.  If any pages at
              // that timestamp are actually present in the cache, they will be churned
              // again, perhaps redundantly.  This is needed to ensure no change slips
              // through in a burst of multiple changes that all have the same timestamp.
        }

        long msChurnBefore = Long.MAX_VALUE;
          // reset below, will prevent repeat churning of multiply changed/listed pages
        String tsLatestChurned = null;  // so far
        queryChain: for( String queryContinuation = "";; )
        {
            // Vetting is not yet implemented in churns.  No revisions are excluded, so
            // the server is unshielded from abusive edits.  We might implement vetting
            // based on a cooling-off period (P).  This would ignore revisions more
            // recent than (now - P), allowing time for abuse to be detected and
            // corrected.  It would require that each query overlap the previous by at
            // least P, so that any rejected revisions were reconsidered for churning.
            // Bypass of P might be allowed for sysop changes, user changes in user pages,
            // leader changes in polls, and so forth.  Deliberate churns (single page
            // reconstructions by user request) would have to abide by complementary
            // rules.  All of this would be somewhat restricted by Semantic MediaWiki's
            // limitations.  RDF export applies only to the current page revision, so only
            // that one can ever be accepted for churning.  Even if an earlier revision
            // had cooled for P, if the current had not, then neither could be accepted
            // for churning.

            final Spool spool = new Spool1();
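            // The spool accumulates release actions (Hold instances) for the connection
            // and stream opened below; they are all released in the finally clause at
            // the end of this try block, whether or not the query succeeds.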
            try
            {
                final HttpURLConnection http;
                try
                {
                    // http://www.mediawiki.org/wiki/API:Query_-_Lists#recentchanges_.2F_rc
                    final URI s = voteServer.pollwiki().scriptURI();
                    final URI queryURI = new URI(
                      s.getScheme(), s.getAuthority(), s.getPath() + "/api.php",
                      /*query*/"action=query&list=recentchanges"
                        + rcendQueryComponent
                        + "&rclimit=" + rclimit + "&rcprop=title|timestamp&rctype=edit&format=xml"
                        + queryContinuation,
                      /*fragment*/null );
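                    // A sketch of the resulting request (attribute values hypothetical):
                    //
                    //   .../api.php?action=query&list=recentchanges&rcend=...
                    //       &rclimit=500&rcprop=title|timestamp&rctype=edit&format=xml
                    //
                    // The XML response is expected to list one <rc> element per changed
                    // page, newest first, plus a query continuation element when more
                    // results remain:
                    //
                    //   <rc title="Some page" timestamp="2010-05-02T22:08:01Z"/>
                    //   ...
                    //   <recentchanges rcstart="2010-05-01T00:00:00Z"/>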
                    LoggerX.i(getClass()).fine( "querying wiki " + queryURI );
                    http = (HttpURLConnection)( queryURI.toURL().openConnection() );
                }
                catch( URISyntaxException x ) { throw new RuntimeException( x ); }

                URLConnectionX.connect( http );
                spool.add( new Hold()
                {
                    public void release() { http.disconnect(); }
                });

                final InputStream in = http.getInputStream();
                spool.add( new Hold()
                {
                    public void release() { try{ in.close(); } catch( Exception x ) { throw VotorolaRuntimeException.castOrWrapped( x ); }}
                });

                final XMLStreamReader r = MediaWiki.newXMLStreamReader( in, spool );
                if( msChurnBefore == Long.MAX_VALUE ) msChurnBefore = System.currentTimeMillis();
                  // After having detected the most recent change, so no gap in which
                  // stale files might get cached and retained.

                queryContinuation = null;
                while( r.hasNext() )
                {
                    r.next();
                    if( r.isStartElement() && "rc".equals( r.getLocalName() ))
                    {
                        ensure( r.getAttributeValue(/*ns*/null,"title"), msChurnBefore,
                          /*allowNewFile*/false );
                        if( tsLatestChurned == null ) // then this is the latest change (first in the list, which runs newest to oldest)
                        {
                            tsLatestChurned = r.getAttributeValue( /*ns*/null, "timestamp" );
                        }
                    }
                    else if( !fromScratch && r.isStartElement()
                      && "recentchanges".equals( r.getLocalName() ))
                    {
                        final String rcstart = r.getAttributeValue( /*ns*/null, "rcstart" );
                        if( rcstart != null ) // also serves to guard against clobbering.  Up to two elements with this same name are expected, only one of which has the sought-for attribute.
                        {
                            queryContinuation = "&rcstart=" + rcstart;
                        }
                    }
                    else if( r.isStartElement() && "error".equals( r.getLocalName() ))
                    {
                        throw new MediaWiki.APIError( r );
                    }
                }
                if( queryContinuation == null ) break queryChain;
            }
            catch( XMLStreamException x ) { throw new IOException( x ); }
            finally{ spool.unwind(); }
        }

        if( tsLatestChurned == null ) // then it must be a brand new wiki, with no changes
        {
            final SimpleDateFormat iso8601Formatter =
           // new SimpleDateFormat( SimpleDateFormatX.ISO_8601_PATTERN_C );
           /// but MediaWiki cannot parse 2010-05-02T18:08:01-0400, so use GMT and 'Z' suffix
              new SimpleDateFormat( SimpleDateFormatX.ISO_8601_LOCAL_PATTERN + "'Z'" );
            iso8601Formatter.setTimeZone( TimeZone.getTimeZone( "GMT" ));
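            // presumably yielding timestamps of the form 2010-05-02T22:08:01Z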

            tsLatestChurned = iso8601Formatter.format( new Date(
              System.currentTimeMillis() - // back far enough to cover clock mis-sync between hosts
                1000/*ms per s*/ * 3600/*s per hour*/ * 24/*hours per day*/ * 7/*days*/ ));
        }

        final Churn churn = new Churn( tsLatestChurned );
        churn.writeObject( WikiCache.this );
        return churn;
    }



    private final File churnSerialFile = new File( WikiCache.this, Churn.SERIAL_FILE_NAME );



    /** @param msChurnBefore the earliest modtime acceptable without churning.  If the
      *   cached file is older, it will be churned.
      * @param allowNewFile whether to allow the addition of a new file, or only churning
      *   of the existing one.
      * @return the corresponding file from the cache; or null if allowNewFile is false,
      *    and the file did not already exist in the cache.
      */
    private File ensure( final String fullPageName, final long msChurnBefore,
      final boolean allowNewFile ) throws IOException
    {
        // Currently this caches a JSON file of the entire RDF export, which is huge.  In
        // future we'll support a soft-scripted transform of the result to a minimal JSON
        // (or perhaps any format) that will then be cached and returned.

        final File cacheFile = newCacheFile( fullPageName );
        if( !cacheFile.exists() )
        {
            if( !allowNewFile ) return null;

            FileX.traverse( cacheFile, // create parent directories and make them writable by all
                new FileFilter() // up
                {
                    public boolean accept( final File f ) { return !f.exists(); }
                },
                new FileFilter() // down
                {
                    public boolean accept( final File f )
                    {
                        if( f != cacheFile )
                        {
                            f.mkdir();
                            f.setWritable( true, /*ownerOnly*/false );
                        }

                        return true;
                    }
                });
        }
        if( !cacheFile.exists() || cacheFile.lastModified() < msChurnBefore )
        {
            final Model data = ModelFactory.createDefaultModel();
            final Spool spool = new Spool1();
            try
            {
                final InputStream in = new BufferedInputStream( newRDFImportStream( fullPageName,
                  spool ));
                spool.add( new Hold()
                {
                    public void release() { try{ in.close(); } catch( Exception x ) { throw VotorolaRuntimeException.castOrWrapped( x ); }}
                });
                data.read( in, /*base, not needed, no relative URLs*/null );
            }
            finally{ spool.unwind(); }
            final File tmpFile = File.createTempFile( cacheFile.getName(), ".json" );
            try
            {
                final Query q = QueryFactory.create( "SELECT * WHERE { ?s ?p ?o }" ); // http://tech.groups.yahoo.com/group/jena-dev/message/23035
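                // Serializing the result set below should yield the standard SPARQL
                // query results JSON format, roughly of this shape (a sketch, values
                // hypothetical):
                //
                //     { "head": { "vars": [ "s", "p", "o" ] },
                //       "results": { "bindings": [
                //           { "s": {..}, "p": {..}, "o": {..} }, ... ] } }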
                final QueryExecution qexec = QueryExecutionFactory.create( q, data );
                final ResultSet rs = qexec.execSelect();
                final BufferedOutputStream out = new BufferedOutputStream(
                  new FileOutputStream( tmpFile ));
                try
                {
                    ResultSetFormatter.outputAsJSON( out, rs ); // appears to output UTF-8
                }
                finally{ out.close(); }
                tmpFile.setWritable( true, /*ownerOnly*/false );
             // cacheFile.delete(); // non-atomic with rename, but rename alone should work:
                FileX.renameFromDefaultToMv( tmpFile, cacheFile );
            }
            finally{ if( tmpFile.isFile() ) tmpFile.delete(); } // clean up from exception
        }
        return cacheFile;
    }



    private File newCacheFile( final String fullPageName )
    {
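        // Maps the page to a file under this cache directory, e.g. (hypothetically)
        // "User:Example/Notes" to User/Example/Notes.json; pages without a namespace
        // prefix are filed under Main/.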
        final MatchResult m = MediaWiki.parsePageName( fullPageName );
        if( m == null ) throw new VotorolaRuntimeException( "malformed page name: " + fullPageName );

        String namespace = m.group( 1 );
        if( namespace == null ) namespace = "Main";

        final String pageName = m.group( 2 );
        return new File( WikiCache.this, namespace + File.separator +
          (File.separatorChar == '/'? pageName: pageName.replace('/',File.separatorChar)) + ".json" );
    }



    /** @param spool the spool for closing resources.
      * @return the stream from which to read the RDF.  Close it when you are finished
      *   with it.
      */
    private InputStream newRDFImportStream( final String fullPageName, final Spool spool )
      throws IOException
    {
        final UriBuilder ub = UriBuilder.fromUri( voteServer.pollwiki().scriptURI() );
        ub.path( "index.php" );
     // ub.queryParam( "page", fullPageName );
     /// ignored by Semantic MediaWiki 1.7.1, which instead serves its query form; so append as a subpage:
        ub.queryParam( "title", "Special:ExportRDF/" + fullPageName );
        ub.queryParam( "backlinks", "0" );
        ub.queryParam( "recursive", "0" );
        final URI uri = ub.build();
        LoggerX.i(getClass()).fine( "querying wiki " + uri );
        final HttpURLConnection http = (HttpURLConnection)( uri.toURL().openConnection() );

        URLConnectionX.connect( http );
        spool.add( new Hold()
        {
            public void release() { http.disconnect(); }
        });
        return http.getInputStream();
    }



    private final VoteServer voteServer;



   // ====================================================================================


    private static final class Churn implements Serializable
    {

        private static final long serialVersionUID = 0L;


        private Churn( String _tsLatestChurned )
        {
            if( _tsLatestChurned == null ) throw new NullPointerException(); // fail fast

            tsLatestChurned = _tsLatestChurned;
        }


       // ````````````````````````````````````````````````````````````````````````````````


        static Churn readObject( final WikiCache wikiCache ) throws IOException
        {
            final File serialFile = wikiCache.churnSerialFile;
            if( !serialFile.isFile() ) return null;

            try
            {
                return (Churn)FileX.readObject( serialFile );
            }
            catch( ClassNotFoundException x ) { throw new RuntimeException( x ); }
        }


        final void writeObject( final WikiCache wikiCache ) throws IOException
        {
            final File serialFile = wikiCache.churnSerialFile;
            FileX.writeObject( Churn.this, serialFile );
            serialFile.setWritable( true, /*ownerOnly*/false );
        }


       // --------------------------------------------------------------------------------


        private static final String SERIAL_FILE_NAME = "lastChurn.serial";


        /** The timestamp of the latest page revision in this churn.
          */
        private final String tsLatestChurned;


    }



}