001package votorola.a; // Copyright 2010-2013, Michael Allan. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Votorola Software"), to deal in the Votorola Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicence, and/or sell copies of the Votorola Software, and to permit persons to whom the Votorola Software is furnished to do so, subject to the following conditions: The preceding copyright notice and this permission notice shall be included in all copies or substantial portions of the Votorola Software. THE VOTOROLA SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE VOTOROLA SOFTWARE OR THE USE OR OTHER DEALINGS IN THE VOTOROLA SOFTWARE. 002 003import com.hp.hpl.jena.query.*; 004import com.hp.hpl.jena.rdf.model.*; 005import java.io.*; 006import java.net.*; 007import java.text.*; 008import java.util.*; 009import java.util.regex.*; 010import javax.ws.rs.core.UriBuilder; 011import javax.xml.stream.*; 012import votorola.a.*; 013import votorola.g.*; 014import votorola.g.hold.*; 015import votorola.g.io.*; 016import votorola.g.lang.*; 017import votorola.g.logging.*; 018import votorola.g.net.*; 019import votorola.g.text.*; 020 021 022/** A directory of "semantic" data cached from the pollwiki 023 * <code>~/votorola/in/wiki</code>. Cache files are created with broad permissions and 024 * may be overwritten by any runtime owner (vote-server account, servlet container and 025 * others). The administrator may safely delete the contained files at runtime without 026 * causing unrecoverable errors, but should not delete the directory itself until after 027 * shutting down all runtime processes. 028 * 029 * @see <a href='../../../../s/manual.xht#wiki' 030 * >../s/manual.xht#wiki</a> 031 */ 032public @ThreadSafe final class WikiCache extends File 033{ 034 035 036 /** Constructs a WikiCache. 037 */ 038 WikiCache( VoteServer _voteServer ) throws IOException 039 { 040 super( _voteServer.inDirectory(), "wiki" ); 041 voteServer = _voteServer; 042 043 if( !exists() ) init_create(); 044 } 045 046 047 048 private final Churn init_create() throws IOException 049 { 050 if( !mkdir() ) throw new IOException( "unable to create directory: " + WikiCache.this ); 051 052 setWritable( true, /*ownerOnly*/false ); 053 return churn( /*fromScratch*/true ); 054 } 055 056 057 058 // ------------------------------------------------------------------------------------ 059 060 061 /** Replaces any stale pages in the cache with fresh copies from the wiki. This 062 * should be called periodically, e.g. via one of the administrative commands (such 063 * as votrace) that has a <code>--churn</code> option. However it should not be 064 * called too often as it has the side effect of clearing the poll cache. You may 065 * instead want to query the pollwiki directly; Semantic MediaWiki 1.7 introduces a 066 * MediaWiki API extension that might be more convenient than the RDF interface. 067 * 068 * <p>Note: only changes to page content are detected. A property change owing to a 069 * change in the content of a template will not be detected. This is a BUG and the 070 * current workaround is to manually delete the page in the cache.</p> 071 * 072 * @return this wiki cache. 073 * 074 * @see #lastChurnTime() 075 * @see votorola.a.count.PollService.VoteServerScope.Run#ensurePoll(String) 076 */ 077 public WikiCache churn() throws IOException 078 { 079 churn( /*fromScratch*/false ); 080 return WikiCache.this; 081 } 082 083 084 085 /** Returns the time of the last churn based on the timestamp of the churn's serial 086 * file. 087 * 088 * @return time in milliseconds since the Epoch, or 0L if unknown. 089 */ 090 public long lastChurnTime() { return churnSerialFile.lastModified(); } 091 092 093 094 /** Opens an input stream for the RDF of the specified wiki page in JSON format, UTF-8 095 * character encoding. Attempts first to use the local cache, falling back to the 096 * wiki if necessary and caching the result for future calls. 097 * 098 * @param fullPageName the full name of the page, including any namespace. 099 */ 100 public FileInputStream openRDF_JSON( final String fullPageName ) throws IOException 101 { 102 return openRDF_JSON( fullPageName, /*toChurn*/false ); 103 } 104 105 106 107 /** Opens an input stream for the RDF of the specified wiki page in JSON format, UTF-8 108 * character encoding. Attempts first to use the local cache, falling back to the 109 * wiki if necessary and caching the result for future calls. 110 * 111 * @param fullPageName the full name of the page, including any namespace. 112 * @param toChurn whether or not to bypass the cache and fetch the RDF straight 113 * from the wiki. If true, any previously cached file will be ignored and (if 114 * the fetch succeeds) overrwritten. 115 */ 116 public FileInputStream openRDF_JSON( final String fullPageName, final boolean toChurn ) 117 throws IOException 118 { 119 final long msChurnBefore = toChurn? Long.MAX_VALUE: Long.MIN_VALUE; 120 retry: for( int retryCount = 0;; ++retryCount ) 121 { 122 final File cacheFile = ensure( fullPageName, msChurnBefore, /*allowNewFile*/true ); 123 try { return new FileInputStream( cacheFile ); } 124 catch( final FileNotFoundException x ) 125 { 126 if( cacheFile.exists() || retryCount > 0 ) throw x; 127 // Else retry. The cacheFile was there earlier (ensured above), so it was 128 // probably deleted by the administrator. A single retry should suffice. 129 } 130 } 131 } 132 133 134 135 /** Fetches the RDF of the specified wiki page in JSON format. Attempts first to use 136 * the local cache, falling back to the wiki if necessary and caching the result for 137 * future calls. 138 * 139 * @param fullPageName the full name of the page, including any namespace. 140 */ 141 public String readRDF_JSON( final String fullPageName ) throws IOException 142 { 143 return readRDF_JSON( fullPageName, /*toChurn*/false ); 144 } 145 146 147 148 /** Fetches the RDF of the specified wiki page in JSON format. Attempts first to use 149 * the local cache, falling back to the wiki if necessary and caching the result for 150 * future calls. 151 * 152 * @param fullPageName the full name of the page, including any namespace. 153 * @param toChurn whether or not to bypass the cache and fetch the RDF straight 154 * from the wiki. If true, any previously cached file will be ignored and (if 155 * the fetch succeeds) overrwritten. 156 */ 157 public String readRDF_JSON( final String fullPageName, final boolean toChurn ) 158 throws IOException 159 { 160 final BufferedReader in = new BufferedReader( new InputStreamReader( 161 openRDF_JSON(fullPageName,toChurn), "UTF-8" )); 162 try { return ReaderX.appendTo( new StringBuilder(), in ).toString(); } 163 finally{ in.close(); } 164 } 165 166 167 168//// P r i v a t e /////////////////////////////////////////////////////////////////////// 169 170 171 private final Churn churn( final boolean fromScratch ) throws IOException 172 { 173 final String rclimit; 174 final String rcendQueryComponent; // specifying the earliest to churn 175 if( fromScratch ) 176 { 177 rclimit = "1"; // just enough to set tsLatestChurned (below) 178 rcendQueryComponent = ""; // no need, fetching just one 179 } 180 else 181 { 182 rclimit = "500"; // per query, max for ordinary user 183 Churn lastChurn = Churn.readObject( WikiCache.this ); 184 if( lastChurn == null ) 185 { 186 LoggerX.i(getClass()).config( "lost churn history, cleaning out entire cache" ); 187 if( !FileX.deleteRecursive( WikiCache.this )) throw new IOException( "unable to delete directory (please delete it manually): " + WikiCache.this ); 188 189 return init_create(); 190 } 191 192 rcendQueryComponent = "&rcend=" + lastChurn.tsLatestChurned; // oldest, listed new to old 193 // Churning at the last timestamp of the previous churn. If any pages at 194 // that timestamp are actually present in the cache, they will be churned 195 // again, perhaps redundantly. This is needed to ensure no change slips 196 // through in a burst of multiple changes that all have the same timestamp. 197 } 198 199 long msChurnBefore = Long.MAX_VALUE; 200 // reset below, will prevent repeat churning of multiply changed/listed pages 201 String tsLatestChurned = null; // so far 202 queryChain: for( String queryContinuation = "";; ) 203 { 204 // Vetting is not yet implemented in churns. No revisions are excluded, so 205 // the the server is unshielded from abusive edits. We might implement 206 // vetting based on a cooling-off period (P). This would ignore recent 207 // revisions (now - P), allowing time for abuse to be detected and corrected. 208 // This would require that each query overlap the previous by at least P, in 209 // order that any rejected revisions were again reconsidered for churning. 210 // Bypass of P might be allowed for sysop changes, user changes in user pages, 211 // leader changes in polls, and so forth. Deliberate churns (single page 212 // reconstructions by user request) would have to abide by complementary 213 // rules. All of this would be somewhat restricted by Semantic MediaWiki's 214 // limitations. RDF export applies only to the current page revision, so only 215 // that one can ever be accepted for churning. Even if an earlier revision 216 // had cooled for P, if the current had not, then neither could be accepted 217 // for churning. 218 219 final Spool spool = new Spool1(); 220 try 221 { 222 final HttpURLConnection http; 223 try 224 { 225 // http://www.mediawiki.org/wiki/API:Query_-_Lists#recentchanges_.2F_rc 226 final URI s = voteServer.pollwiki().scriptURI(); 227 final URI queryURI = new URI( 228 s.getScheme(), s.getAuthority(), s.getPath() + "/api.php", 229 /*query*/"action=query&list=recentchanges" 230 + rcendQueryComponent 231 + "&rclimit=" + rclimit + "&rcprop=title|timestamp&rctype=edit&format=xml" 232 + queryContinuation, 233 /*fragment*/null ); 234 LoggerX.i(getClass()).fine( "querying wiki " + queryURI ); 235 http = (HttpURLConnection)( queryURI.toURL().openConnection() ); 236 } 237 catch( URISyntaxException x ) { throw new RuntimeException( x ); } 238 239 URLConnectionX.connect( http ); 240 spool.add( new Hold() 241 { 242 public void release() { http.disconnect(); } 243 }); 244 245 final InputStream in = http.getInputStream(); 246 spool.add( new Hold() 247 { 248 public void release() { try{ in.close(); } catch( Exception x ) { throw VotorolaRuntimeException.castOrWrapped( x ); }} 249 }); 250 251 final XMLStreamReader r = MediaWiki.newXMLStreamReader( in, spool ); 252 if( msChurnBefore == Long.MAX_VALUE ) msChurnBefore = System.currentTimeMillis(); 253 // After having detected the most recent change, so no gap in which 254 // stale files might get cached and retained. 255 256 queryContinuation = null; 257 while( r.hasNext() ) 258 { 259 r.next(); 260 if( r.isStartElement() && "rc".equals( r.getLocalName() )) 261 { 262 ensure( r.getAttributeValue(/*ns*/null,"title"), msChurnBefore, 263 /*allowNewFile*/false ); 264 if( tsLatestChurned == null ) // then this is the last one (first in list) 265 { 266 tsLatestChurned = r.getAttributeValue( /*ns*/null, "timestamp" ); 267 } 268 } 269 else if( !fromScratch && r.isStartElement() 270 && "recentchanges".equals( r.getLocalName() )) 271 { 272 final String rcstart = r.getAttributeValue( /*ns*/null, "rcstart" ); 273 if( rcstart != null ) // also serves to gaurd against clobbering. Up to two elements are expected with this same name, only one of which has the sought for attribute. 274 { 275 queryContinuation = "&rcstart=" + rcstart; 276 } 277 } 278 else if( r.isStartElement() && "error".equals( r.getLocalName() )) 279 { 280 throw new MediaWiki.APIError( r ); 281 } 282 } 283 if( queryContinuation == null ) break queryChain; 284 } 285 catch( XMLStreamException x ) { throw new IOException( x ); } 286 finally{ spool.unwind(); } 287 } 288 289 if( tsLatestChurned == null ) // then it must be a brand new wiki, with no changes 290 { 291 final SimpleDateFormat iso8601Formatter = 292 // new SimpleDateFormat( SimpleDateFormatX.ISO_8601_PATTERN_C ); 293 /// but MediaWiki cannot parse 2010-05-02T18:08:01-0400, so use GMT and 'Z' suffix 294 new SimpleDateFormat( SimpleDateFormatX.ISO_8601_LOCAL_PATTERN + "'Z'" ); 295 iso8601Formatter.setTimeZone( TimeZone.getTimeZone( "GMT" )); 296 297 tsLatestChurned = iso8601Formatter.format( new Date( 298 System.currentTimeMillis() - // back far enough to cover clock mis-sync between hosts 299 1000/*ms per s*/ * 3600/*s per hour*/ * 24/*hours per day*/ * 7/*days*/ )); 300 } 301 302 final Churn churn = new Churn( tsLatestChurned ); 303 churn.writeObject( WikiCache.this ); 304 return churn; 305 } 306 307 308 309 private final File churnSerialFile = new File( WikiCache.this, Churn.SERIAL_FILE_NAME ); 310 311 312 313 /** @param msChurnBefore the earliest modtime acceptable without churning. If the 314 * cached file is older, it will be churned. 315 * @param allowNewFile whether to allow the addition of a new file, or only churning 316 * of the existing one. 317 * @return the corresponding file from the cache; or null if allowNewFile is false, 318 * and the file did not already exist in the cache. 319 */ 320 private File ensure( final String fullPageName, final long msChurnBefore, 321 final boolean allowNewFile ) throws IOException 322 { 323 // Currently this caches JSON file of the entire RDF export, which is huge. In 324 // future we'll support a soft-scripted transform of the result to a minimal JSON 325 // (or perhaps any format) that will then be cached and returned. 326 327 final File cacheFile = newCacheFile( fullPageName ); 328 if( !cacheFile.exists() ) 329 { 330 if( !allowNewFile ) return null; 331 332 FileX.traverse( cacheFile, // create parent directories and make them writable by all 333 new FileFilter() // up 334 { 335 public boolean accept( final File f ) { return !f.exists(); } 336 }, 337 new FileFilter() // down 338 { 339 public boolean accept( final File f ) 340 { 341 if( f != cacheFile ) 342 { 343 f.mkdir(); 344 f.setWritable( true, /*ownerOnly*/false ); 345 } 346 347 return true; 348 } 349 }); 350 } 351 if( !cacheFile.exists() || cacheFile.lastModified() < msChurnBefore ) 352 { 353 final Model data = ModelFactory.createDefaultModel(); 354 final Spool spool = new Spool1(); 355 try 356 { 357 final InputStream in = new BufferedInputStream( newRDFImportStream( fullPageName, 358 spool )); 359 spool.add( new Hold() 360 { 361 public void release() { try{ in.close(); } catch( Exception x ) { throw VotorolaRuntimeException.castOrWrapped( x ); }} 362 }); 363 data.read( in, /*base, not needed, no relative URLs*/null ); 364 } 365 finally{ spool.unwind(); } 366 final File tmpFile = File.createTempFile( cacheFile.getName(), ".json" ); 367 try 368 { 369 final Query q = QueryFactory.create( "SELECT * WHERE { ?s ?p ?o }" ); // http://tech.groups.yahoo.com/group/jena-dev/message/23035 370 final QueryExecution qexec = QueryExecutionFactory.create( q, data ); 371 final ResultSet rs = qexec.execSelect(); 372 final BufferedOutputStream out = new BufferedOutputStream( 373 new FileOutputStream( tmpFile )); 374 try 375 { 376 ResultSetFormatter.outputAsJSON( out, rs ); // appears to output UTF-8 377 } 378 finally{ out.close(); } 379 tmpFile.setWritable( true, /*ownerOnly*/false ); 380 // cacheFile.delete(); // non-atomic with rename, but rename alone should work: 381 FileX.renameFromDefaultToMv( tmpFile, cacheFile ); 382 } 383 finally{ if( tmpFile.isFile() ) tmpFile.delete(); } // clean up from exception 384 } 385 return cacheFile; 386 } 387 388 389 390 private File newCacheFile( final String fullPageName ) 391 { 392 final MatchResult m = MediaWiki.parsePageName( fullPageName ); 393 if( m == null ) throw new VotorolaRuntimeException( "malformed page name: " + fullPageName ); 394 395 String namespace = m.group( 1 ); 396 if( namespace == null ) namespace = "Main"; 397 398 final String pageName = m.group( 2 ); 399 return new File( WikiCache.this, namespace + File.separator + 400 (File.separatorChar == '/'? pageName: pageName.replaceAll("/",File.separator)) + ".json" ); 401 } 402 403 404 405 /** @param spool the spool for closing resources. 406 * @return the stream from which to read the RDF. Close it when you are finished 407 * with it. 408 */ 409 private InputStream newRDFImportStream( final String fullPageName, final Spool spool ) 410 throws IOException 411 { 412 final UriBuilder ub = UriBuilder.fromUri( voteServer.pollwiki().scriptURI() ); 413 ub.path( "index.php" ); 414 // ub.queryParam( "page", fullPageName ); 415 /// ignored by Semantic MediaWiki 1.7.1, it instead serves query form. so append as subpage: 416 ub.queryParam( "title", "Special:ExportRDF/" + fullPageName ); 417 ub.queryParam( "backlinks", "0" ); 418 ub.queryParam( "recursive", "0" ); 419 final URI uri = ub.build(); 420 LoggerX.i(getClass()).fine( "querying wiki " + uri ); 421 final HttpURLConnection http = (HttpURLConnection)( uri.toURL().openConnection() ); 422 423 URLConnectionX.connect( http ); 424 spool.add( new Hold() 425 { 426 public void release() { http.disconnect(); } 427 }); 428 return http.getInputStream(); 429 } 430 431 432 433 private final VoteServer voteServer; 434 435 436 437 // ==================================================================================== 438 439 440 private static final class Churn implements Serializable 441 { 442 443 private static final long serialVersionUID = 0L; 444 445 446 private Churn( String _tsLatestChurned ) 447 { 448 if( _tsLatestChurned == null ) throw new NullPointerException(); // fail fast 449 450 tsLatestChurned = _tsLatestChurned; 451 } 452 453 454 // ```````````````````````````````````````````````````````````````````````````````` 455 456 457 static Churn readObject( final WikiCache wikiCache ) throws IOException 458 { 459 final File serialFile = wikiCache.churnSerialFile; 460 if( !serialFile.isFile() ) return null; 461 462 try 463 { 464 return (Churn)FileX.readObject( serialFile ); 465 } 466 catch( ClassNotFoundException x ) { throw new RuntimeException( x ); } 467 } 468 469 470 final void writeObject( final WikiCache wikiCache ) throws IOException 471 { 472 final File serialFile = wikiCache.churnSerialFile; 473 FileX.writeObject( Churn.this, serialFile); 474 serialFile.setWritable( true, /*ownerOnly*/false ); 475 } 476 477 478 // -------------------------------------------------------------------------------- 479 480 481 private static final String SERIAL_FILE_NAME = "lastChurn.serial"; 482 483 484 /** The timestamp of the latest page revision in this churn. 485 */ 486 private final String tsLatestChurned; 487 488 489 } 490 491 492 493}