003import com.hp.hpl.jena.query.*;
004import com.hp.hpl.jena.rdf.model.*;
005import java.io.*;
006import java.net.*;
007import java.text.*;
008import java.util.*;
009import java.util.regex.*;
010import javax.ws.rs.core.UriBuilder;
011import javax.xml.stream.*;
012import votorola.a.*;
013import votorola.g.*;
014import votorola.g.hold.*;
015import votorola.g.io.*;
016import votorola.g.lang.*;
017import votorola.g.logging.*;
018import votorola.g.net.*;
019import votorola.g.text.*;
022/** A directory of "semantic" data cached from the pollwiki
023  * <code>~/votorola/in/wiki</code>.  Cache files are created with broad permissions and
024  * may be overwritten by any runtime owner (vote-server account, servlet container and
025  * others).  The administrator may safely delete the contained files at runtime without
026  * causing unrecoverable errors, but should not delete the directory itself until after
027  * shutting down all runtime processes.
028  *
029  *     @see <a href='../../../../s/manual.xht#wiki'
030  *                           >../s/manual.xht#wiki</a>
031  */
032public @ThreadSafe final class WikiCache extends File
036    /** Constructs a WikiCache.
037      */
038    WikiCache( VoteServer _voteServer ) throws IOException
039    {
040        super( _voteServer.inDirectory(), "wiki" );
041        voteServer = _voteServer;
043        if( !exists() ) init_create();
044    }
048    private final Churn init_create() throws IOException
049    {
050        if( !mkdir() ) throw new IOException( "unable to create directory: " + WikiCache.this );
052        setWritable( true, /*ownerOnly*/false );
053        return churn( /*fromScratch*/true );
054    }
058   // ------------------------------------------------------------------------------------
061    /** Replaces any stale pages in the cache with fresh copies from the wiki.  This
062      * should be called periodically, e.g. via one of the administrative commands (such
063      * as votrace) that has a <code>--churn</code> option.  However it should not be
064      * called too often as it has the side effect of clearing the poll cache.  You may
065      * instead want to query the pollwiki directly; Semantic MediaWiki 1.7 introduces a
066      * MediaWiki API extension that might be more convenient than the RDF interface.
067      *
068      * <p>Note: only changes to page content are detected.  A property change owing to a
069      * change in the content of a template will not be detected.  This is a BUG and the
070      * current workaround is to manually delete the page in the cache.</p>
071      *
072      *     @return this wiki cache.
073      *
074      *     @see #lastChurnTime()
075      *     @see votorola.a.count.PollService.VoteServerScope.Run#ensurePoll(String)
076      */
077    public WikiCache churn() throws IOException
078    {
079        churn( /*fromScratch*/false );
080        return WikiCache.this;
081    }
085    /** Returns the time of the last churn based on the timestamp of the churn's serial
086      * file.
087      *
088      *     @return time in milliseconds since the Epoch, or 0L if unknown.
089      */
090    public long lastChurnTime() { return churnSerialFile.lastModified(); }
094    /** Opens an input stream for the RDF of the specified wiki page in JSON format, UTF-8
095      * character encoding.  Attempts first to use the local cache, falling back to the
096      * wiki if necessary and caching the result for future calls.
097      *
098      *     @param fullPageName the full name of the page, including any namespace.
099      */
100    public FileInputStream openRDF_JSON( final String fullPageName ) throws IOException
101    {
102        return openRDF_JSON( fullPageName, /*toChurn*/false );
103    }
107    /** Opens an input stream for the RDF of the specified wiki page in JSON format, UTF-8
108      * character encoding.  Attempts first to use the local cache, falling back to the
109      * wiki if necessary and caching the result for future calls.
110      *
111      *     @param fullPageName the full name of the page, including any namespace.
112      *     @param toChurn whether or not to bypass the cache and fetch the RDF straight
113      *       from the wiki.  If true, any previously cached file will be ignored and (if
114      *       the fetch succeeds) overrwritten.
115      */
116    public FileInputStream openRDF_JSON( final String fullPageName, final boolean toChurn )
117      throws IOException
118    {
119        final long msChurnBefore = toChurn? Long.MAX_VALUE: Long.MIN_VALUE;
120        retry: for( int retryCount = 0;; ++retryCount )
121        {
122            final File cacheFile = ensure( fullPageName, msChurnBefore, /*allowNewFile*/true );
123            try { return  new FileInputStream( cacheFile ); }
124            catch( final FileNotFoundException x )
125            {
126                if( cacheFile.exists() || retryCount > 0 ) throw x;
127                // Else retry.  The cacheFile was there earlier (ensured above), so it was
128                // probably deleted by the administrator.  A single retry should suffice.
129            }
130        }
131    }
135    /** Fetches the RDF of the specified wiki page in JSON format.  Attempts first to use
136      * the local cache, falling back to the wiki if necessary and caching the result for
137      * future calls.
138      *
139      *     @param fullPageName the full name of the page, including any namespace.
140      */
141    public String readRDF_JSON( final String fullPageName ) throws IOException
142    {
143        return readRDF_JSON( fullPageName, /*toChurn*/false );
144    }
148    /** Fetches the RDF of the specified wiki page in JSON format.  Attempts first to use
149      * the local cache, falling back to the wiki if necessary and caching the result for
150      * future calls.
151      *
152      *     @param fullPageName the full name of the page, including any namespace.
153      *     @param toChurn whether or not to bypass the cache and fetch the RDF straight
154      *       from the wiki.  If true, any previously cached file will be ignored and (if
155      *       the fetch succeeds) overrwritten.
156      */
157    public String readRDF_JSON( final String fullPageName, final boolean toChurn )
158      throws IOException
159    {
160        final BufferedReader in = new BufferedReader( new InputStreamReader(
161          openRDF_JSON(fullPageName,toChurn), "UTF-8" ));
162        try { return ReaderX.appendTo( new StringBuilder(), in ).toString(); }
163        finally{ in.close(); }
164    }
168//// P r i v a t e ///////////////////////////////////////////////////////////////////////
171    private final Churn churn( final boolean fromScratch ) throws IOException
172    {
173        final String rclimit;
174        final String rcendQueryComponent; // specifying the earliest to churn
175        if( fromScratch )
176        {
177            rclimit = "1"; // just enough to set tsLatestChurned (below)
178            rcendQueryComponent = ""; // no need, fetching just one
179        }
180        else
181        {
182            rclimit = "500"; // per query, max for ordinary user
183            Churn lastChurn = Churn.readObject( WikiCache.this );
184            if( lastChurn == null )
185            {
186                LoggerX.i(getClass()).config( "lost churn history, cleaning out entire cache" );
187                if( !FileX.deleteRecursive( WikiCache.this )) throw new IOException( "unable to delete directory (please delete it manually): " + WikiCache.this );
189                return init_create();
190            }
192            rcendQueryComponent = "&rcend=" + lastChurn.tsLatestChurned; // oldest, listed new to old
193              // Churning at the last timestamp of the previous churn.  If any pages at
194              // that timestamp are actually present in the cache, they will be churned
195              // again, perhaps redundantly.  This is needed to ensure no change slips
196              // through in a burst of multiple changes that all have the same timestamp.
197        }
199        long msChurnBefore = Long.MAX_VALUE;
200          // reset below, will prevent repeat churning of multiply changed/listed pages
201        String tsLatestChurned = null;  // so far
202        queryChain: for( String queryContinuation = "";; )
203        {
204            // Vetting is not yet implemented in churns.  No revisions are excluded, so
205            // the the server is unshielded from abusive edits.  We might implement
206            // vetting based on a cooling-off period (P).  This would ignore recent
207            // revisions (now - P), allowing time for abuse to be detected and corrected.
208            // This would require that each query overlap the previous by at least P, in
209            // order that any rejected revisions were again reconsidered for churning.
210            // Bypass of P might be allowed for sysop changes, user changes in user pages,
211            // leader changes in polls, and so forth.  Deliberate churns (single page
212            // reconstructions by user request) would have to abide by complementary
213            // rules. All of this would be somewhat restricted by Semantic MediaWiki's
214            // limitations.  RDF export applies only to the current page revision, so only
215            // that one can ever be accepted for churning.  Even if an earlier revision
216            // had cooled for P, if the current had not, then neither could be accepted
217            // for churning.
219            final Spool spool = new Spool1();
220            try
221            {
222                final HttpURLConnection http;
223                try
224                {
225                    // http://www.mediawiki.org/wiki/API:Query_-_Lists#recentchanges_.2F_rc
226                    final URI s = voteServer.pollwiki().scriptURI();
227                    final URI queryURI = new URI(
228                      s.getScheme(), s.getAuthority(), s.getPath() + "/api.php",
229                      /*query*/"action=query&list=recentchanges"
230                        + rcendQueryComponent
231                        + "&rclimit=" + rclimit + "&rcprop=title|timestamp&rctype=edit&format=xml"
232                        + queryContinuation,
233                      /*fragment*/null );
234                    LoggerX.i(getClass()).fine( "querying wiki " + queryURI );
235                    http = (HttpURLConnection)( queryURI.toURL().openConnection() );
236                }
237                catch( URISyntaxException x ) { throw new RuntimeException( x ); }
239                URLConnectionX.connect( http );
240                spool.add( new Hold()
241                {
242                    public void release() { http.disconnect(); }
243                });
245                final InputStream in = http.getInputStream();
246                spool.add( new Hold()
247                {
248                    public void release() { try{ in.close(); } catch( Exception x ) { throw VotorolaRuntimeException.castOrWrapped( x ); }}
249                });
251                final XMLStreamReader r = MediaWiki.newXMLStreamReader( in, spool );
252                if( msChurnBefore == Long.MAX_VALUE ) msChurnBefore = System.currentTimeMillis();
253                  // After having detected the most recent change, so no gap in which
254                  // stale files might get cached and retained.
256                queryContinuation = null;
257                while( r.hasNext() )
258                {
259                    r.next();
260                    if( r.isStartElement() && "rc".equals( r.getLocalName() ))
261                    {
262                        ensure( r.getAttributeValue(/*ns*/null,"title"), msChurnBefore,
263                          /*allowNewFile*/false );
264                        if( tsLatestChurned == null ) // then this is the last one (first in list)
265                        {
266                            tsLatestChurned = r.getAttributeValue( /*ns*/null, "timestamp" );
267                        }
268                    }
269                    else if( !fromScratch && r.isStartElement()
270                      && "recentchanges".equals( r.getLocalName() ))
271                    {
272                        final String rcstart = r.getAttributeValue( /*ns*/null, "rcstart" );
273                        if( rcstart != null ) // also serves to gaurd against clobbering.  Up to two elements are expected with this same name, only one of which has the sought for attribute.
274                        {
275                            queryContinuation = "&rcstart=" + rcstart;
276                        }
277                    }
278                    else if( r.isStartElement() && "error".equals( r.getLocalName() ))
279                    {
280                        throw new MediaWiki.APIError( r );
281                    }
282                }
283                if( queryContinuation == null ) break queryChain;
284            }
285            catch( XMLStreamException x ) { throw new IOException( x ); }
286            finally{ spool.unwind(); }
287        }
289        if( tsLatestChurned == null ) // then it must be a brand new wiki, with no changes
290        {
291            final SimpleDateFormat iso8601Formatter =
292           // new SimpleDateFormat( SimpleDateFormatX.ISO_8601_PATTERN_C );
293           /// but MediaWiki cannot parse 2010-05-02T18:08:01-0400, so use GMT and 'Z' suffix
294              new SimpleDateFormat( SimpleDateFormatX.ISO_8601_LOCAL_PATTERN + "'Z'" );
295            iso8601Formatter.setTimeZone( TimeZone.getTimeZone( "GMT" ));
297            tsLatestChurned = iso8601Formatter.format( new Date(
298              System.currentTimeMillis() - // back far enough to cover clock mis-sync between hosts
299                1000/*ms per s*/ * 3600/*s per hour*/ * 24/*hours per day*/ * 7/*days*/ ));
300        }
302        final Churn churn = new Churn( tsLatestChurned );
303        churn.writeObject( WikiCache.this );
304        return churn;
305    }
    /** The file in the cache directory to which the record of the last churn is
      * serialized.  Its modification time is what lastChurnTime() reports.
      */
    private final File churnSerialFile = new File( WikiCache.this, Churn.SERIAL_FILE_NAME );
313    /** @param msChurnBefore the earliest modtime acceptable without churning.  If the
314      *   cached file is older, it will be churned.
315      * @param allowNewFile whether to allow the addition of a new file, or only churning
316      *   of the existing one.
317      * @return the corresponding file from the cache; or null if allowNewFile is false,
318      *    and the file did not already exist in the cache.
319      */
320    private File ensure( final String fullPageName, final long msChurnBefore,
321      final boolean allowNewFile ) throws IOException
322    {
323        // Currently this caches JSON file of the entire RDF export, which is huge.  In
324        // future we'll support a soft-scripted transform of the result to a minimal JSON
325        // (or perhaps any format) that will then be cached and returned.
327        final File cacheFile = newCacheFile( fullPageName );
328        if( !cacheFile.exists() )
329        {
330            if( !allowNewFile ) return null;
332            FileX.traverse( cacheFile, // create parent directories and make them writable by all
333                new FileFilter() // up
334                {
335                    public boolean accept( final File f ) { return !f.exists(); }
336                },
337                new FileFilter() // down
338                {
339                    public boolean accept( final File f )
340                    {
341                        if( f != cacheFile )
342                        {
343                            f.mkdir();
344                            f.setWritable( true, /*ownerOnly*/false );
345                        }
347                        return true;
348                    }
349                });
350        }
351        if( !cacheFile.exists() || cacheFile.lastModified() < msChurnBefore )
352        {
353            final Model data = ModelFactory.createDefaultModel();
354            final Spool spool = new Spool1();
355            try
356            {
357                final InputStream in = new BufferedInputStream( newRDFImportStream( fullPageName,
358                  spool ));
359                spool.add( new Hold()
360                {
361                    public void release() { try{ in.close(); } catch( Exception x ) { throw VotorolaRuntimeException.castOrWrapped( x ); }}
362                });
363                data.read( in, /*base, not needed, no relative URLs*/null );
364            }
365            finally{ spool.unwind(); }
366            final File tmpFile = File.createTempFile( cacheFile.getName(), ".json" );
367            try
368            {
369                final Query q = QueryFactory.create( "SELECT * WHERE { ?s ?p ?o }" ); // http://tech.groups.yahoo.com/group/jena-dev/message/23035
370                final QueryExecution qexec = QueryExecutionFactory.create( q, data );
371                final ResultSet rs = qexec.execSelect();
372                final BufferedOutputStream out = new BufferedOutputStream(
373                  new FileOutputStream( tmpFile ));
374                try
375                {
376                    ResultSetFormatter.outputAsJSON( out, rs ); // appears to output UTF-8
377                }
378                finally{ out.close(); }
379                tmpFile.setWritable( true, /*ownerOnly*/false );
380             // cacheFile.delete(); // non-atomic with rename, but rename alone should work:
381                FileX.renameFromDefaultToMv( tmpFile, cacheFile );
382            }
383            finally{ if( tmpFile.isFile() ) tmpFile.delete(); } // clean up from exception
384        }
385        return cacheFile;
386    }
390    private File newCacheFile( final String fullPageName )
391    {
392        final MatchResult m = MediaWiki.parsePageName( fullPageName );
393        if( m == null ) throw new VotorolaRuntimeException( "malformed page name: " + fullPageName );
395        String namespace = m.group( 1 );
396        if( namespace == null ) namespace = "Main";
398        final String pageName = m.group( 2 );
399        return new File( WikiCache.this, namespace + File.separator +
400          (File.separatorChar == '/'? pageName: pageName.replaceAll("/",File.separator)) + ".json" );
401    }
405    /** @param spool the spool for closing resources.
406      * @return the stream from which to read the RDF.  Close it when you are finished
407      *   with it.
408      */
409    private InputStream newRDFImportStream( final String fullPageName, final Spool spool )
410      throws IOException
411    {
412        final UriBuilder ub = UriBuilder.fromUri( voteServer.pollwiki().scriptURI() );
413        ub.path( "index.php" );
414     // ub.queryParam( "page", fullPageName );
415     /// ignored by Semantic MediaWiki 1.7.1, it instead serves query form.  so append as subpage:
416        ub.queryParam( "title", "Special:ExportRDF/" + fullPageName );
417        ub.queryParam( "backlinks", "0" );
418        ub.queryParam( "recursive", "0" );
419        final URI uri = ub.build();
420        LoggerX.i(getClass()).fine( "querying wiki " + uri );
421        final HttpURLConnection http = (HttpURLConnection)( uri.toURL().openConnection() );
423        URLConnectionX.connect( http );
424        spool.add( new Hold()
425        {
426            public void release() { http.disconnect(); }
427        });
428        return http.getInputStream();
429    }
    /** The vote-server for which this cache was constructed.  Its pollwiki is the
      * source of all cached data.
      */
    private final VoteServer voteServer;
437   // ====================================================================================
440    private static final class Churn implements Serializable
441    {
443        private static final long serialVersionUID = 0L;
446        private Churn( String _tsLatestChurned )
447        {
448            if( _tsLatestChurned == null ) throw new NullPointerException(); // fail fast
450            tsLatestChurned = _tsLatestChurned;
451        }
454       // ````````````````````````````````````````````````````````````````````````````````
457        static Churn readObject( final WikiCache wikiCache ) throws IOException
458        {
459            final File serialFile = wikiCache.churnSerialFile;
460            if( !serialFile.isFile() ) return null;
462            try
463            {
464                return (Churn)FileX.readObject( serialFile );
465            }
466            catch( ClassNotFoundException x ) { throw new RuntimeException( x ); }
467        }
470        final void writeObject( final WikiCache wikiCache ) throws IOException
471        {
472            final File serialFile = wikiCache.churnSerialFile;
473            FileX.writeObject( Churn.this, serialFile);
474            serialFile.setWritable( true, /*ownerOnly*/false );
475        }
478       // --------------------------------------------------------------------------------
481        private static final String SERIAL_FILE_NAME = "lastChurn.serial";
484        /** The timestamp of the latest page revision in this churn.
485          */
486        private final String tsLatestChurned;
489    }