// Copyright (C) 2006 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/**
 * @fileoverview
 * some functions for browser-side pretty printing of code contained in html.
 *
 * The lexer should work on a number of languages including C and friends,
 * Java, Python, Bash, SQL, HTML, XML, CSS, Javascript, and Makefiles.
 * It works passably on Ruby, PHP and Awk and a decent subset of Perl, but,
 * because of commenting conventions, doesn't work on Smalltalk, Lisp-like, or
 * CAML-like languages.
 *
 * If there's a language not mentioned here, then I don't know it, and don't
 * know whether it works.  If it has a C-like, Bash-like, or XML-like syntax
 * then it should work passably.
 *
 * Usage:
 * 1) include this source file in an html page via
 *    <script type="text/javascript" src="/path/to/prettify.js"></script>
 * 2) define style rules.  See the example page for examples.
 * 3) mark the <pre> and <code> tags in your source
 *    with class=prettyprint.
 *    You can also use the (html deprecated) <xmp> tag, but the pretty printer
 *    needs to do more substantial DOM manipulations to support that, so some
 *    css styles may not be preserved.
 * That's it.  I wanted to keep the API as simple as possible, so there's no
 * need to specify which language the code is in.
 *
 * Change log:
 * cbeust, 2006/08/22
 *   Java annotations (start with "@") are now captured as literals ("lit")
 */
/**
 * the set of keywords of our target languages.  Every keyword is a key of
 * this object, mapped to true.
 */
var PR_keywords = {};
/** initialize the keyword list for our target languages. */
(function () {
  // NOTE: an earlier revision of this list also contained the words
  // "declaration," and "directive", which are not C++ keywords (they came
  // from the phrase "using declaration, using directive").
  var CPP_KEYWORDS = (
    "bool break case catch char class const const_cast continue default " +
    "delete deprecated dllexport dllimport do double dynamic_cast else enum " +
    "explicit extern false float for friend goto if inline int long mutable " +
    "naked namespace new noinline noreturn nothrow novtable operator private " +
    "property protected public register reinterpret_cast return selectany " +
    "short signed sizeof static static_cast struct switch template this " +
    "thread throw true try typedef typeid typename union unsigned using " +
    "uuid virtual void volatile while typeof");
  var JAVA_KEYWORDS = (
    "abstract default goto package synchronized boolean do if private this " +
    "break double implements protected throw byte else import public throws " +
    "case enum instanceof return transient catch extends int short try char " +
    "final interface static void class finally long strictfp volatile const " +
    "float native super while continue for new switch");
  var PYTHON_KEYWORDS = (
    "and assert break class continue def del elif else except exec finally " +
    "for from global if import in is lambda not or pass print raise return " +
    "try while yield False True None");
  var JSCRIPT_KEYWORDS = (
    "abstract boolean break byte case catch char class const continue " +
    "debugger default delete do double else enum export extends false final " +
    "finally float for function goto if implements import in instanceof int " +
    "interface long native new null package private protected public return " +
    "short static super switch synchronized this throw throws transient " +
    "true try typeof var void volatile while with NaN Infinity");
  var PERL_KEYWORDS = (
    "foreach require sub unless until use elsif BEGIN END");
  var SH_KEYWORDS = (
    "if then do else fi end");
  var RUBY_KEYWORDS = (
    "if then elsif else end begin do rescue ensure while for class module " +
    "def yield raise until unless and or not when case super undef break " +
    "next redo retry in return alias defined");
  var KEYWORDS = [CPP_KEYWORDS, JAVA_KEYWORDS, JSCRIPT_KEYWORDS, PERL_KEYWORDS,
                  PYTHON_KEYWORDS, RUBY_KEYWORDS, SH_KEYWORDS];
  for (var k = 0; k < KEYWORDS.length; k++) {
    var kw = KEYWORDS[k].split(' ');
    for (var i = 0; i < kw.length; i++) {
      // split yields empty strings for doubled spaces; never register those.
      if (kw[i]) { PR_keywords[kw[i]] = true; }
    }
  }
}).call(this);
// token style names. Each value is the css class name used for that kind
// of token.
/** token style for a string literal */
var PR_STRING = 'str';
/** token style for a keyword */
var PR_KEYWORD = 'kwd';
/** token style for a comment */
var PR_COMMENT = 'com';
/** token style for a type */
var PR_TYPE = 'typ';
/** token style for a literal value. e.g. 1, null, true. */
var PR_LITERAL = 'lit';
/** token style for a punctuation string. */
var PR_PUNCTUATION = 'pun';
/** token style for plain text, e.g. the text PR_chunkify finds between tags. */
var PR_PLAIN = 'pln';
/** token style for an sgml tag. */
var PR_TAG = 'tag';
/** token style for a markup declaration such as a DOCTYPE. */
var PR_DECLARATION = 'dec';
/** token style for embedded source. */
var PR_SOURCE = 'src';
/** token style for an sgml attribute name. */
var PR_ATTRIB_NAME = 'atn';
/** token style for an sgml attribute value. */
var PR_ATTRIB_VALUE = 'atv';
/** records where a token ends, together with the style of that token.
  * A division of a string into n tokens can be represented as a series of
  * n - 1 token ends, as long as runs of whitespace warrant their own token.
  * @constructor
  * @param end the position at which the token ends.  Must be a number.
  * @param style the style of the token that ends there.  May be null but
  *   must not be undefined.
  * @private
  */
function PR_TokenEnd(end, style) {
  // both arguments are mandatory; reject calls that forgot one of them.
  if (style === undefined) { throw new Error('BAD'); }
  if (typeof end !== 'number') { throw new Error('BAD'); }
  this.end = end;
  this.style = style;
}
/** a debugging representation: "[PR_TokenEnd <end>]", or
  * "[PR_TokenEnd <end>:<style>]" when the style is truthy.
  */
PR_TokenEnd.prototype.toString = function () {
  var styleSuffix = '';
  if (this.style) { styleSuffix = ':' + this.style; }
  return '[PR_TokenEnd ' + this.end + styleSuffix + ']';
};
/** a piece of text paired with a style.  Instances serve both as the final
  * output of the lexing functions and as intermediate results between them.
  * @constructor
  * @param token the token text
  * @param style one of the token style names, or null for a styleless token,
  *   such as an embedded html tag.  Must not be undefined.
  * @private
  */
function PR_Token(token, style) {
  // a missing style is a programming error; null is the "no style" value.
  if (style === undefined) { throw new Error('BAD'); }
  this.token = token;
  this.style = style;
}
/** a debugging representation: "[PR_Token <text>]", or
  * "[PR_Token <text>:<style>]" when the style is truthy.
  */
PR_Token.prototype.toString = function () {
  var styleSuffix = '';
  if (this.style) { styleSuffix = ':' + this.style; }
  return '[PR_Token ' + this.token + styleSuffix + ']';
};
/** a helper class that decodes common html entities used to escape source and
  * markup punctuation characters in html.
  * After each call to decode, the field ch holds the decoded character and
  * the field next holds the index of the first character following it.
  * @constructor
  * @private
  */
function PR_DecodeHelper() {
  this.next = 0;
  this.ch = '\0';
}
/** decodes the character, or the html entity, that starts at index i of s.
  * The entities &lt; &gt; &quot; &apos; and &amp; are recognized, in any
  * letter case.  Any other ampersand is returned as a literal '&'.
  * @param s the string to decode from.
  * @param i the index in s at which to decode.
  * @return the decoded character, which is also stored in this.ch.  this.next
  *   is set to the index just past the consumed character or entity.
  */
PR_DecodeHelper.prototype.decode = function (s, i) {
  var next = i + 1;
  var ch = s.charAt(i);
  if ('&' === ch) {
    var semi = s.indexOf(';', next);
    // The longest recognized names ("quot" and "apos") have 4 characters, so
    // the semicolon may sit up to 4 characters after the name starts.  The
    // bound must be inclusive or those two entities are never decoded.
    if (semi >= 0 && semi <= next + 4) {
      var decoded = null;
      switch (s.substring(next, semi).toLowerCase()) {
        case 'lt': decoded = '<'; break;
        case 'gt': decoded = '>'; break;
        case 'quot': decoded = '"'; break;
        case 'apos': decoded = '\''; break;
        case 'amp': decoded = '&'; break;
      }
      if (null !== decoded) {
        ch = decoded;
        next = semi + 1;
      }
      // otherwise: an unknown entity.  Leave ch as '&' and consume only it.
    }
  }
  this.next = next;
  this.ch = ch;
  return this.ch;
};
// some string utilities
/** true iff ch lies in the range 'a'..'z' or in the range 'A'..'Z'. */
function PR_isWordChar(ch) {
  var isLowerCase = ch >= 'a' && ch <= 'z';
  var isUpperCase = ch >= 'A' && ch <= 'Z';
  return isLowerCase || isUpperCase;
}
/** true iff ch may begin an identifier: a letter (per PR_isWordChar), or one
  * of '_', '$' and '@'.
  */
function PR_isIdentifierStart(ch) {
  if (PR_isWordChar(ch)) { return true; }
  return ch == '_' || ch == '$' || ch == '@';
}
/** true iff ch may appear inside an identifier: anything accepted by
  * PR_isIdentifierStart, or a digit (per PR_isDigitChar).
  */
function PR_isIdentifierPart(ch) {
  if (PR_isIdentifierStart(ch)) { return true; }
  return PR_isDigitChar(ch);
}
/** true iff ch occurs in the string of tab, space, carriage return and
  * newline.  Because this is a substring search, the empty string also
  * yields true.
  */
function PR_isSpaceChar(ch) {
  var spaceChars = "\t \r\n";
  return spaceChars.indexOf(ch) >= 0;
}
/** true iff ch lies in the range '0'..'9'. */
function PR_isDigitChar(ch) {
  var atLeastZero = ch >= '0';
  var atMostNine = ch <= '9';
  return atLeastZero && atMostNine;
}
/** strips leading and trailing space characters, as defined by
  * PR_isSpaceChar, from s.
  * @param s a String.
  * @return the part of s between its first and last non-space characters,
  *   or the empty string when s has none.
  */
function PR_trim(s) {
  var last = s.length - 1;
  var first = 0;
  // advance past leading spaces; may run one past last for all-space input
  for (; first <= last; ++first) {
    if (!PR_isSpaceChar(s.charAt(first))) { break; }
  }
  // back up over trailing spaces, never crossing first
  for (; last > first; --last) {
    if (!PR_isSpaceChar(s.charAt(last))) { break; }
  }
  return s.substring(first, last + 1);
}
/** true iff the string s begins with the string prefix. */
function PR_startsWith(s, prefix) {
  var prefixLength = prefix.length;
  if (s.length < prefixLength) { return false; }
  return s.substring(0, prefixLength) == prefix;
}
/** true iff the string s ends with the string suffix. */
function PR_endsWith(s, suffix) {
  var tailStart = s.length - suffix.length;
  if (tailStart < 0) { return false; }
  return s.substring(tailStart, s.length) == suffix;
}
/** true iff the characters of prefix equal the leading elements of
  * chars[0:len].
  * @param chars an indexable sequence of characters.
  * @param len the number of valid elements in chars.
  * @param prefix the String to look for at the start of chars.
  * @return false when prefix is longer than len or when any character
  *   differs; true otherwise, including for an empty prefix.
  * @private
  */
function PR_prefixMatch(chars, len, prefix) {
  if (len < prefix.length) { return false; }
  var prefixLength = prefix.length;
  var matched = 0;
  // count how many leading characters agree
  while (matched < prefixLength && !(prefix.charAt(matched) != chars[matched])) {
    ++matched;
  }
  return matched >= prefixLength;
}
/** used to convert html special characters embedded in XMP tags into html.
  * Escapes '&', '<' and '>' as the entities &amp;amp; &amp;lt; and &amp;gt;
  * respectively.  The ampersand is escaped first so that the ampersands
  * introduced by the other two replacements are not escaped a second time.
  * @param str the plain text to escape.
  * @return str with the three special characters replaced by entities.
  */
function PR_textToHtml(str) {
  return str.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
}
/** cuts a string of html into html tags and the plain text between them.
  * A tag is recognized as '<', optionally followed by '/', followed by a
  * letter, and then everything up to and including the next '>'.
  *
  * @param s a String of html.
  * @return an Array of PR_Tokens.  Tags have style null; the text between
  *   them has style PR_PLAIN.
  * @private
  */
function PR_chunkify(s) {
  // scanner modes
  var TEXT = 0;         // outside of any tag
  var AFTER_LT = 1;     // the previous character was '<'
  var AFTER_SLASH = 2;  // the previous two characters were '</'
  var IN_TAG = 3;       // inside a tag, waiting for its closing '>'

  var result = [];
  var mode = TEXT;
  var textStart = 0;    // start of the text that has not been emitted yet
  var tagStart = -1;    // index of the '<' of the tag being scanned
  var length = s.length;
  for (var idx = 0; idx < length; ++idx) {
    var c = s.charAt(idx);
    if (mode === TEXT) {
      if (c === '<') { mode = AFTER_LT; }
    } else if (mode === AFTER_LT) {
      tagStart = idx - 1;
      if (c === '/') {
        mode = AFTER_SLASH;
      } else if (PR_isWordChar(c)) {
        mode = IN_TAG;
      } else if (c !== '<') {
        mode = TEXT;
      }
      // a second '<' keeps us in AFTER_LT; tagStart moves on the next turn.
    } else if (mode === AFTER_SLASH) {
      if (PR_isWordChar(c)) {
        mode = IN_TAG;
      } else if (c === '<') {
        mode = AFTER_LT;
      } else {
        mode = TEXT;
      }
    } else if (mode === IN_TAG) {
      if (c === '>') {
        // flush any text that precedes the tag, then the tag itself
        if (tagStart > textStart) {
          result.push(new PR_Token(s.substring(textStart, tagStart), PR_PLAIN));
        }
        result.push(new PR_Token(s.substring(tagStart, idx + 1), null));
        textStart = idx + 1;
        tagStart = -1;
        mode = TEXT;
      }
    }
  }
  // whatever remains, including an unterminated tag, is plain text
  if (s.length > textStart) {
    result.push(new PR_Token(s.substring(textStart, s.length), PR_PLAIN));
  }
  return result;
}
/** splits chunks around entities.
  * Each plain text chunk is scanned for html entities of the form
  * '&' ('#' | letter) (letter | digit)* ';'.  Every entity found becomes a
  * token of its own with style null, and the text around it keeps the style
  * of the chunk.  Chunks whose style is not PR_PLAIN are passed through.
  *
  * @param chunks an Array of PR_Tokens.
  * @return a new Array of PR_Tokens.  A plain chunk that contains no entity
  *   is passed through as the same object.
  * @private
  */
function PR_splitEntities(chunks) {
  var chunksOut = [];
  for (var ci = 0, nc = chunks.length; ci < nc; ++ci) {
    var chunk = chunks[ci];
    if (PR_PLAIN !== chunk.style) {
      chunksOut.push(chunk);
      continue;
    }
    var s = chunk.token;
    var pos = 0;     // start of the text in s that has not been emitted yet
    var start = -1;  // index of the '&' of the entity being scanned
    // The scanner restarts for every chunk.  An entity cannot continue from
    // one chunk into another, and a stale start index taken from an earlier
    // chunk would produce bogus or duplicated tokens.
    var state = 0;   // 0: in text, 1: just saw '&', 2: inside an entity
    for (var i = 0, n = s.length; i < n; ++i) {
      var ch = s.charAt(i);
      switch (state) {
        case 0:
          if ('&' === ch) { state = 1; }
          break;
        case 1:
          if ('#' === ch || PR_isWordChar(ch)) {
            start = i - 1;
            state = 2;
          } else if ('&' !== ch) {
            state = 0;
          }
          // on a second '&' stay in state 1: it may itself start an entity.
          break;
        case 2:
          if (';' === ch) {
            if (start > pos) {
              chunksOut.push(
                  new PR_Token(s.substring(pos, start), chunk.style));
            }
            chunksOut.push(new PR_Token(s.substring(start, i + 1), null));
            pos = i + 1;
            state = 0;
          } else if ('&' === ch) {
            // what we were scanning was not an entity, but this might be.
            state = 1;
          } else if (!PR_isWordChar(ch) && !PR_isDigitChar(ch)) {
            // not a character of an entity name: give up on this candidate
            // rather than swallowing text up to some unrelated ';'.
            state = 0;
          }
          break;
      }
    }
    if (s.length > pos) {
      chunksOut.push(pos ?
                     new PR_Token(s.substring(pos, s.length), chunk.style) :
                     chunk);
    }
  }
  return chunksOut;
}
/** walk the tokenEnds list and the chunk list in parallel to generate a list
 * of split tokens.
 *
 * Positions in tokenEnds are in "absolute space": offsets into the text
 * obtained by concatenating the token text of all chunks.  For each token end
 * the function emits one or more PR_Tokens that together cover the text from
 * the previous token end up to this one, starting a new output token wherever
 * a chunk boundary falls inside that range.
 *
 * @param chunks an Array of PR_Tokens.  Only their token and style fields
 *   are read.
 * @param tokenEnds an Array of objects with a numeric end field and a style
 *   field (see PR_TokenEnd).  The arithmetic below appears to assume the
 *   ends are non-decreasing and do not exceed the total chunk text length --
 *   TODO confirm against the callers.
 * @return an Array of PR_Tokens.
 * @private
 */
function PR_splitChunks(chunks, tokenEnds) {
var tokens = new Array(); // the output
var ci = 0; // index into chunks of the next chunk to load
// position of beginning of amount written so far in absolute space.
var posAbs = 0;
// position of amount written so far in chunk space
var posChunk = 0;
// current chunk.  Starts as an empty placeholder, so that the first token of
// non-zero length causes chunks[0] to be loaded by the loop below.
var chunk = new PR_Token('', null);
for (var ei = 0, ne = tokenEnds.length; ei < ne; ++ei) {
var tokenEnd = tokenEnds[ei];
var end = tokenEnd.end;
// number of characters still to be emitted for this token
var tokLen = end - posAbs;
// number of characters of the current chunk not yet emitted
var remainingInChunk = chunk.token.length - posChunk;
// FIX Nicolas LASSALLE : remainingInChunk and tokLen can both be equal, in
// which case the while loop below could spin forever, so it is skipped.
// When it is skipped, the rest of the chunk is emitted by the "if (tokLen)"
// block further down, which always uses tokenEnd.style.
if (remainingInChunk != tokLen) {
// The token reaches to or past the end of the current chunk: emit what is
// left of the chunk, then move on to the next chunk and re-evaluate.
// NOTE(review): once chunks is exhausted, chunk is no longer replaced.  If
// that chunk is empty while tokLen is still >= 0, this loop looks like it
// cannot terminate -- confirm that callers never pass token ends that reach
// beyond the chunk text.
while (remainingInChunk <= tokLen) {
if (remainingInChunk > 0) {
// text from a chunk with a null (or undefined) style stays styleless;
// otherwise it takes the style of the token it belongs to.
tokens.push(
new PR_Token(chunk.token.substring(posChunk, chunk.token.length),
null == chunk.style ? null : tokenEnd.style));
}
posAbs += remainingInChunk;
posChunk = 0;
if (ci < chunks.length) { chunk = chunks[ci++]; }
tokLen = end - posAbs;
remainingInChunk = chunk.token.length - posChunk;
}
}
// Emit the part of the token that lies inside the current chunk.
if (tokLen) {
tokens.push(
new PR_Token(chunk.token.substring(posChunk, posChunk + tokLen),
tokenEnd.style));
posAbs += tokLen;
posChunk += tokLen;
}
}
return tokens;
}
/** splits markup tokens into declarations, tags, and source chunks.
* @private
*/
function PR_splitMarkup(chunks) {
// A state machine to split out declarations, tags, etc.
// This state machine deals with absolute space in the text, indexed by k,
// and position in the current chunk, indexed by pos and tokenStart to
// generate a list of the ends of tokens.
// Absolute space is calculated by considering the chunks as appended into
// one big string, as they were before being split.
// Known failure cases
// Server side scripting sections such as <?...?> in attributes.
// i.e. <span class="<? foo ?>">
// Handling this would require a stack, and we don't use PHP.
// The output: a list of pairs of PR_TokenEnd instances
var tokenEnds = new Array();
var state = 0; // FSM state variable
var k = 0; // position in absolute space of the start of the current chunk
var tokenStart = -1; // the start of the current token
// Try to find a closing tag for any open