using System;
using System.Collections;
using System.IO;
using System.Text.RegularExpressions;

namespace pdfdb
{
    /// <summary>
    /// Command-line tool that scans a PDF database directory, collects the
    /// per-document metadata and regression-test results found in each
    /// six-digit document directory, and writes the HTML index page
    /// (db.html) plus one info.html page per entry in the page list.
    /// </summary>
    class Class1
    {
        /// <summary>
        /// Program entry point.
        /// </summary>
        /// <param name="args">
        /// args[0] is the root directory of the PDF database. It is expected
        /// to contain producers.txt and one sub-directory per document, each
        /// holding data.info, data.pandalex, data.pdfomatic and data.pdfa.
        /// </param>
        [STAThread]
        static void Main(string[] args)
        {
            // Without a database directory there is nothing to do; fail with a
            // message instead of an IndexOutOfRangeException.
            if (args == null || args.Length < 1)
            {
                Console.WriteLine("Usage: supply the path to the PDF database directory as the first argument");
                Environment.ExitCode = 1;
                return;
            }
            string root = args[0];

            // We use regular expressions to match URLs to the names of the Producers
            // NOTE(review): urlregexps is filled here but not read anywhere else
            // in this file as shown -- confirm whether its consumer was lost.
            Hashtable urlregexps = new Hashtable();
            Console.WriteLine("Load in the producer regexps");
            try
            {
                using (StreamReader re = new StreamReader(root + "/producers.txt"))
                {
                    string line;
                    while ((line = re.ReadLine()) != null)
                    {
                        string[] tokens = Regex.Split(line, ":[ \t]+");
                        if (tokens.Length > 1)
                        {
                            urlregexps[tokens[0]] = tokens[1];
                        }
                        else
                        {
                            // A malformed line no longer aborts loading of the
                            // remaining producers.
                            Console.WriteLine("producers.txt: Unknown line: {0}", line);
                        }
                    }
                }
            }
            catch (Exception e)
            {
                // Best effort: the producer table is optional for the scan
                // itself, but say why it failed so the problem can be diagnosed.
                Console.WriteLine("Could not read producer regexps: {0}", e.Message);
            }

            // We store the great big list of things we've found here
            // NOTE(review): 'all', 'pdfmeta' and 'pdfpage' are not consumed by
            // the output code visible in this file -- confirm.
            string all = "";
            SortedList producer = new SortedList();
            SortedList tagged = new SortedList();
            SortedList encrypted = new SortedList();
            SortedList linearized = new SortedList();
            SortedList version = new SortedList();
            SortedList pdfpage = new SortedList();
            SortedList pdfmeta = new SortedList();
            SortedList pagecount = new SortedList();
            SortedList plparse = new SortedList();
            SortedList pdfomatic = new SortedList();
            SortedList pdfa = new SortedList();
            int docs = 0;
            int exceptions = 0;

            Console.WriteLine("Find all the PDFs available in {0}", root);
            string[] dirs = Directory.GetDirectories(root);
            for (int i = 0; i < dirs.Length; i++)
            {
                // Match against the directory's own name only. Matching the
                // full path would pick up six digits that appear in the root
                // path itself, which isn't intended.
                Match m = Regex.Match(Path.GetFileName(dirs[i]), "[0-9][0-9][0-9][0-9][0-9][0-9]");
                if (!m.Success)
                {
                    continue;
                }

                string id = m.ToString();
                string docDir = root + "/" + id;

                Console.Write(".");
                docs++;
                all += ";" + id;

                // The reader is opened inside the try so that a missing or
                // unreadable data.info is counted as a per-document failure
                // rather than terminating the whole run.
                try
                {
                    using (StreamReader info = new StreamReader(docDir + "/data.info"))
                    {
                        string line;
                        while ((line = info.ReadLine()) != null)
                        {
                            string[] tokens = Regex.Split(line, ":[ \t]+");
                            if (tokens.Length <= 1)
                            {
                                Console.WriteLine("{1}: Unknown line: {0}", line, id);
                                continue;
                            }

                            switch (tokens[0])
                            {
                                case "Producer":
                                    Append(producer, tokens[1], ";" + id);
                                    break;

                                case "Tagged":
                                    Append(tagged, tokens[1], ";" + id);
                                    break;

                                case "Encrypted":
                                    Append(encrypted, tokens[1], ";" + id);
                                    break;

                                case "Optimized":
                                    Append(linearized, tokens[1], ";" + id);
                                    break;

                                case "PDF version":
                                    Append(version, tokens[1], ";" + id);
                                    break;

                                case "Pages":
                                    // A non-numeric page count throws and is
                                    // reported by the catch below.
                                    int lower = PageCountClass(Int32.Parse(tokens[1]));
                                    Append(pagecount, lower, ";" + id + "," + tokens[1]);
                                    break;

                                default:
                                    Append(pdfmeta, id, "" + tokens[0] + ": " + tokens[1] + "\n");
                                    break;
                            }
                        }
                    }
                }
                catch (Exception)
                {
                    Console.WriteLine("{0}: Caught exception processing this document", id);
                    exceptions++;
                }

                // We also need to check the status of the latest PandaLex parsing regression test.
                // A status line is the first line of the file and every line
                // that directly follows a separator line of dashes.
                string mostRecentStatus = "";
                string previousStatus = "";
                bool statusNext = true;
                try
                {
                    using (StreamReader pandalex = new StreamReader(docDir + "/data.pandalex"))
                    {
                        string line;
                        while ((line = pandalex.ReadLine()) != null)
                        {
                            if (statusNext)
                            {
                                previousStatus = mostRecentStatus;
                                mostRecentStatus = line;
                            }
                            statusNext = Regex.Match(line, "--------------------------").Success;
                        }
                    }
                }
                catch (Exception e)
                {
                    Console.WriteLine("{0}: Caught exception processing pandalex regression: {1}", id, e.ToString());
                    exceptions++;
                }

                // Now do something with the status we found. The result is the
                // second tab-separated field; a status without one is skipped.
                string[] results = Regex.Split(mostRecentStatus, "\t");
                if (results.Length > 1)
                {
                    string change = results[1];
                    string[] oldresults = Regex.Split(previousStatus, "\t");
                    // With no usable previous result the value counts as unchanged.
                    string oldvalue = oldresults.Length > 1 ? oldresults[1] : change;
                    if (change != oldvalue)
                    {
                        change += " (new)";
                    }
                    Append(plparse, change, ";" + id);
                }

                // We also need to check the status of the latest pdfomatic parsing regression test
                Append(pdfomatic,
                       ReadLatestStatus(docDir + "/data.pdfomatic", id, "pdfomatic", ref exceptions),
                       ";" + id);

                // Ditto PDF/A
                Append(pdfa,
                       ReadLatestStatus(docDir + "/data.pdfa", id, "pdfa", ref exceptions),
                       ";" + id);

                // End of PDF file processing
            }
            Console.WriteLine("");

            // Dump out the indices list
            Console.WriteLine("Writing the main index page");
            using (StreamWriter output = new StreamWriter(root + "/db.html"))
            {
                output.WriteLine("Mikal's PDF database");
                output.WriteLine("Stillhq.com PDF Database: Index\n\n");
                output.WriteLine("\n");
                output.WriteLine("All the PDF documents in this database are sorted in a variety of ways. ");
                output.WriteLine("Click on a number to see a page about the PDF with that id number.");
                output.WriteLine("There are currently {0} PDF documents in the database. ", docs);
                output.WriteLine("\n\nUpdate: Please note that the name of each producer is now a link back to ");
                output.WriteLine("their respective websites. I have also introduced a frequently asked questions page. ");
                output.WriteLine("This FAQ page also discusses how to mount the PDF database as a drive on your windows machine, which is a very convenient way of downloading the entire database.");
                output.WriteLine("\n\nPlease send email to mikal@stillhq.com ");
                output.WriteLine("if you would like to donate sample PDFs. ");
                output.WriteLine("\n\n\n");

                output.WriteLine("Navigation: Producer ");
                output.WriteLine("Tagging ");
                output.WriteLine("Encryption ");
                output.WriteLine("Linearization ");
                output.WriteLine("Version ");
                output.WriteLine("Length ");
                output.WriteLine("PandaLex ");
                output.WriteLine("PdfOMatic ");
                output.WriteLine("PDF/A compliance ");
                output.WriteLine("All ");
                output.WriteLine("\n\n");

                WriteCategory(output, producer, "Producer", "\nBy producer\n");
                WriteCategory(output, tagged, "Tagged", "\nBy tagging\n");
                WriteCategory(output, encrypted, "Encrypted", "\nBy encryption\n");
                WriteCategory(output, linearized, "Linearized", "\nBy linearization\n");
                WriteCategory(output, version, "Specification verion", "\nBy PDF specification version\n");
                WriteCategory(output, pagecount, "Document length", "\nBy document length\n");
                WriteCategory(output, plparse, "PandaLex parse results", "\nBy PandaLex parse result\n");
                WriteCategory(output, pdfomatic, "PdfOMatic parse results", "\nBy PdfOMatic parse result\n");
                WriteCategory(output, pdfa, "PDF/A compliance", "\nBy PDF/A compliance\n");

                // And finally list all documents
                output.WriteLine("\nAll documents\n");
                output.WriteLine("");

                output.WriteLine("\n\n\n\n\nPDF database administered by ");
                output.WriteLine("mikal@stillhq.com\n");
                output.WriteLine("Database Copyright (c) Michael Still 2003, 2004, 2005. PDFs Copyright their various authors.");
            }

            // Now write out each of the individual pages
            // NOTE(review): nothing in this file adds entries to pdfpage, so as
            // shown this loop writes no pages -- confirm.
            Console.WriteLine("");
            Console.Write("Generating PDF pages: ");
            foreach (object key in pdfpage.Keys)
            {
                Console.Write(".");
                using (StreamWriter page = new StreamWriter(root + "/" + key.ToString() + "/info.html"))
                {
                    page.WriteLine("Mikal's PDF database: PDF Number {0}", key.ToString());
                    page.WriteLine("Stillhq.com ");
                    page.WriteLine("PDF Database : {0}\n\n", key.ToString());

                    // The header has been pregenerated
                    page.WriteLine("\nDatabase indices\nMetadata\nGhostscript errors\nNotes\nDownload\nThumbnails\n");

                    page.WriteLine("\n\n\nPDF database administered by ");
                    page.WriteLine("mikal@stillhq.com\n");
                    page.WriteLine("Database Copyright (c) Michael Still 2003. PDFs Copyright their various authors.");
                }
            }

            Console.WriteLine("");
            Console.WriteLine("Exceptions caught: {0}", exceptions);
        }

        /// <summary>
        /// Appends <paramref name="text"/> to the string stored under
        /// <paramref name="key"/>, creating the entry when it does not exist.
        /// </summary>
        static void Append(SortedList list, object key, string text)
        {
            // A missing key yields null, which concatenates as the empty string.
            list[key] = list[key] + text;
        }

        /// <summary>
        /// Returns the lower bound of the page-count class for
        /// <paramref name="pc"/>: classes of ten below 100 pages, classes of a
        /// hundred from 100 pages upwards.
        /// </summary>
        static int PageCountClass(int pc)
        {
            if (pc < 100)
            {
                return pc - pc % 10;
            }
            return pc - pc % 100;
        }

        /// <summary>
        /// Reads a regression log and returns its last line, with " (new)"
        /// appended when that line differs from the line before it.
        /// </summary>
        /// <param name="path">File to read.</param>
        /// <param name="id">Document id used as the prefix of error messages.</param>
        /// <param name="label">Name of the regression used in error messages.</param>
        /// <param name="exceptions">Incremented when the file cannot be read.</param>
        /// <returns>
        /// The decorated last line; the empty string for an empty or
        /// unreadable file.
        /// </returns>
        static string ReadLatestStatus(string path, string id, string label, ref int exceptions)
        {
            string mostRecentStatus = "";
            string previousStatus = "";
            try
            {
                using (StreamReader input = new StreamReader(path))
                {
                    string line;
                    while ((line = input.ReadLine()) != null)
                    {
                        previousStatus = mostRecentStatus;
                        mostRecentStatus = line;
                    }
                }
            }
            catch (Exception e)
            {
                Console.WriteLine("{0}: Caught exception processing {1} regression: {2}", id, label, e.ToString());
                exceptions++;
            }

            if (mostRecentStatus != previousStatus)
            {
                mostRecentStatus += " (new)";
            }
            return mostRecentStatus;
        }

        /// <summary>
        /// Writes the heading of one index category to the index page and
        /// echoes the category name on the console.
        /// </summary>
        /// <param name="output">Writer for the index page.</param>
        /// <param name="target">
        /// Collected entries for the category.
        /// NOTE(review): the entries are not rendered by the code in this
        /// file as shown -- confirm whether the listing code was lost.
        /// </param>
        /// <param name="catname">Category name printed on the console.</param>
        /// <param name="heading">Heading text written to the index page.</param>
        static void WriteCategory(StreamWriter output, SortedList target, string catname, string heading)
        {
            output.WriteLine(heading);
            Console.Write("{0}: ", catname);
            output.WriteLine("");
            Console.WriteLine("");
        }
    }
}