561 lines
16 KiB
561 lines
16 KiB
3 months ago
|
<?php
|
||
|
#
# SmartyPants  -  Smart typography for web sites
#
# PHP SmartyPants
# Copyright (c) 2004-2016 Michel Fortin
# <https://michelf.ca/>
#
# Original SmartyPants
# Copyright (c) 2003-2004 John Gruber
# <https://daringfireball.net/>
#
|
||
|
namespace Michelf;
|
||
|
|
||
|
|
||
|
#
# SmartyPants Parser Class
#
|
||
|
|
||
|
class SmartyPants {
    #
    # Converts plain ASCII punctuation found in the text portions of an HTML
    # string (straight quotes, ``backticks'', "--", "...") into typographic
    # HTML entities. Markup itself, and the content of the elements listed
    # in $tags_to_skip, is passed through untouched.
    #

    ### Version ###

    const SMARTYPANTSLIB_VERSION = "1.8.1";


    ### Presets

    # SmartyPants does nothing at all
    const ATTR_DO_NOTHING = 0;
    # "--" for em-dashes; no en-dash support
    const ATTR_EM_DASH = 1;
    # "---" for em-dashes; "--" for en-dashes
    const ATTR_LONG_EM_DASH_SHORT_EN = 2;
    # "--" for em-dashes; "---" for en-dashes
    const ATTR_SHORT_EM_DASH_LONG_EN = 3;
    # "Stupefy" mode: translate SmartyPants entities back to plain ASCII
    const ATTR_STUPEFY = -1;

    # The default preset: ATTR_EM_DASH
    const ATTR_DEFAULT = self::ATTR_EM_DASH;


    ### Standard Function Interface ###

    public static function defaultTransform($text, $attr = self::ATTR_DEFAULT) {
        #
        # Transform $text with a parser configured by $attr and return the
        # result. One parser instance is created per (called class, $attr)
        # pair and reused on later calls, so this works for derived classes
        # too.
        #
        # Take parser class on which this function was called.
        $parser_class = \get_called_class();

        # Parsers already built, indexed by class name, then by attribute.
        static $parser_list = array();

        if (!isset($parser_list[$parser_class][$attr])) {
            $parser_list[$parser_class][$attr] = new $parser_class($attr);
        }

        return $parser_list[$parser_class][$attr]->transform($text);
    }


    ### Configuration Variables ###

    # Partial regex for matching tags to skip
    public $tags_to_skip = 'pre|code|kbd|script|style|math';

    # Options to specify which transformations to make:
    public $do_nothing   = 0; # disable all transforms
    public $do_quotes    = 0;
    public $do_backticks = 0; # 1 => double only, 2 => double & single
    public $do_dashes    = 0; # 1, 2, or 3 for the three modes described above
    public $do_ellipses  = 0;
    public $do_stupefy   = 0;
    public $convert_quot = 0; # should we translate &quot; entities into normal quotes?

    # Smart quote characters:
    # Opening and closing smart double-quotes.
    public $smart_doublequote_open  = '&#8220;';
    public $smart_doublequote_close = '&#8221;';
    public $smart_singlequote_open  = '&#8216;';
    public $smart_singlequote_close = '&#8217;'; # Also apostrophe.

    # ``Backtick quotes''
    public $backtick_doublequote_open  = '&#8220;'; // replacement for ``
    public $backtick_doublequote_close = '&#8221;'; // replacement for ''
    public $backtick_singlequote_open  = '&#8216;'; // replacement for `
    public $backtick_singlequote_close = '&#8217;'; // replacement for ' (also apostrophe)

    # Other punctuation
    public $em_dash  = '&#8212;';
    public $en_dash  = '&#8211;';
    public $ellipsis = '&#8230;';


    ### Parser Implementation ###

    public function __construct($attr = self::ATTR_DEFAULT) {
        #
        # Initialize a parser with certain attributes.
        #
        # Parser attributes:
        # 0 : do nothing
        # 1 : set all
        # 2 : set all, using old school en- and em- dash shortcuts
        # 3 : set all, using inverted old school en and em- dash shortcuts
        # -1: "stupefy" mode (entities back to ASCII)
        #
        # Any other value is read as a string of option letters:
        # q : quotes
        # b : backtick quotes (``double'' only)
        # B : backtick quotes (``double'' and `single')
        # d : dashes
        # D : old school dashes
        # i : inverted old school dashes
        # e : ellipses
        # w : convert &quot; entities to " for Dreamweaver users
        #
        # Note: switch compares loosely (like ==), so the integer presets and
        # their numeric-string spellings select the same branch.
        #
        switch ($attr) {
            case "0":
                $this->do_nothing = 1;
                break;

            case "1":
                # Do everything, turn all options on.
                $this->do_quotes    = 1;
                $this->do_backticks = 1;
                $this->do_dashes    = 1;
                $this->do_ellipses  = 1;
                break;

            case "2":
                # Do everything, turn all options on, use old school dash shorthand.
                $this->do_quotes    = 1;
                $this->do_backticks = 1;
                $this->do_dashes    = 2;
                $this->do_ellipses  = 1;
                break;

            case "3":
                # Do everything, turn all options on, use inverted old school dash shorthand.
                $this->do_quotes    = 1;
                $this->do_backticks = 1;
                $this->do_dashes    = 3;
                $this->do_ellipses  = 1;
                break;

            case "-1":
                # Special "stupefy" mode.
                $this->do_stupefy = 1;
                break;

            default:
                # Option letters, one flag per character; later letters win
                # when two of them set the same option.
                $chars = preg_split('//', (string) $attr, -1, PREG_SPLIT_NO_EMPTY);
                foreach ($chars as $c) {
                    switch ($c) {
                        case "q": $this->do_quotes    = 1; break;
                        case "b": $this->do_backticks = 1; break;
                        case "B": $this->do_backticks = 2; break;
                        case "d": $this->do_dashes    = 1; break;
                        case "D": $this->do_dashes    = 2; break;
                        case "i": $this->do_dashes    = 3; break;
                        case "e": $this->do_ellipses  = 1; break;
                        case "w": $this->convert_quot = 1; break;
                        default:
                            # Unknown attribute option, ignore.
                            break;
                    }
                }
                break;
        }
    }

    public function transform($text) {
        #
        # Parameter:  String containing HTML markup.
        # Returns:    The string with the enabled transformations applied to
        #             every text run that is not inside one of the
        #             $tags_to_skip elements. Tags are copied verbatim.
        #
        if ($this->do_nothing) {
            return $text;
        }

        $tokens = $this->tokenizeHTML($text);
        $result = '';
        $in_pre = 0; # Keep track of when we're inside <pre> or <code> tags.

        # This is a cheat, used to get some context for one-character tokens
        # that consist of just a quote char. What we do is remember the last
        # character of the previous text token, to use as context to curl
        # single-character quote tokens correctly.
        $prev_token_last_char = "";

        foreach ($tokens as $cur_token) {
            if ($cur_token[0] == "tag") {
                # Don't mess with quotes inside tags.
                $result .= $cur_token[1];
                if (preg_match('@<(/?)(?:'.$this->tags_to_skip.')[\s>]@', $cur_token[1], $matches)) {
                    # A closing tag leaves the protected region, an opening
                    # tag enters it. This is a flag, not a nesting counter.
                    $in_pre = ($matches[1] == '/') ? 0 : 1;
                }
                continue;
            }

            $t = $cur_token[1];
            $last_char = substr($t, -1); # Remember last char of this token before processing.
            if (!$in_pre) {
                $t = $this->educate($t, $prev_token_last_char);
            }
            $prev_token_last_char = $last_char;
            $result .= $t;
        }

        return $result;
    }

    public function decodeEntitiesInConfiguration() {
        #
        # Utility function that converts entities in configuration variables to
        # UTF-8 characters. Call it when the output should contain the actual
        # characters instead of numeric entities.
        #
        $output_config_vars = array(
            'smart_doublequote_open',
            'smart_doublequote_close',
            'smart_singlequote_open',
            'smart_singlequote_close',
            'backtick_doublequote_open',
            'backtick_doublequote_close',
            'backtick_singlequote_open',
            'backtick_singlequote_close',
            'em_dash',
            'en_dash',
            'ellipsis',
        );
        foreach ($output_config_vars as $var) {
            # Flags and charset are spelled out so the result does not depend
            # on the PHP version or on the default_charset ini setting.
            $this->$var = html_entity_decode($this->$var, ENT_QUOTES, 'UTF-8');
        }
    }

    protected function educate($t, $prev_token_last_char) {
        #
        # Parameters: $t, one text token; $prev_token_last_char, the last
        #             character of the previous text token ("" if none).
        # Returns:    The token after applying every enabled transformation.
        #
        # Escapes go first so that escaped characters are already entities
        # and are left alone by the steps below.
        $t = $this->processEscapes($t);

        if ($this->convert_quot) {
            $t = str_replace('&quot;', '"', $t);
        }

        switch ($this->do_dashes) {
            case 1: $t = $this->educateDashes($t); break;
            case 2: $t = $this->educateDashesOldSchool($t); break;
            case 3: $t = $this->educateDashesOldSchoolInverted($t); break;
        }

        if ($this->do_ellipses) {
            $t = $this->educateEllipses($t);
        }

        # Note: backticks need to be processed before quotes.
        if ($this->do_backticks) {
            $t = $this->educateBackticks($t);
            if ($this->do_backticks == 2) {
                $t = $this->educateSingleBackticks($t);
            }
        }

        if ($this->do_quotes) {
            if ($t === "'" || $t === '"') {
                # Special case: the token is a lone quote character, so the
                # only context available is the previous token's last char.
                # Anything but whitespace (or nothing) before it means the
                # quote closes; otherwise it opens.
                $closes = preg_match('/\S/', $prev_token_last_char);
                if ($t === "'") {
                    $t = $closes ? $this->smart_singlequote_close
                                 : $this->smart_singlequote_open;
                } else {
                    $t = $closes ? $this->smart_doublequote_close
                                 : $this->smart_doublequote_open;
                }
            } else {
                # Normal case:
                $t = $this->educateQuotes($t);
            }
        }

        if ($this->do_stupefy) {
            $t = $this->stupefyEntities($t);
        }

        return $t;
    }

    protected function educateQuotes($_) {
        #
        # Parameter:  String.
        #
        # Returns:    The string, with "educated" curly quote HTML entities.
        #
        # Example input:  "Isn't this fun?"
        # Example output: &#8220;Isn&#8217;t this fun?&#8221;
        #
        # The replacements below are order-dependent: each step consumes the
        # quotes it recognises and the final str_replace() calls treat
        # whatever is left as opening quotes.
        #
        $dq_open  = $this->smart_doublequote_open;
        $dq_close = $this->smart_doublequote_close;
        $sq_open  = $this->smart_singlequote_open;
        $sq_close = $this->smart_singlequote_close;

        # Make our own "punctuation" character class, because the POSIX-style
        # [:PUNCT:] is only available in Perl 5.6 or later:
        $punct_class = "[!\"#\\$\\%'()*+,-.\\/:;<=>?\\@\\[\\\\\]\\^_`{|}~]";

        # Special case if the very first character is a quote
        # followed by punctuation at a non-word-break. Close the quotes by brute force:
        $_ = preg_replace(
            array("/^'(?=$punct_class\\B)/", "/^\"(?=$punct_class\\B)/"),
            array($sq_close,                 $dq_close), $_);

        # Special case for double sets of quotes, e.g.:
        #   <p>He said, "'Quoted' words in a larger quote."</p>
        $_ = preg_replace(
            array("/\"'(?=\w)/",     "/'\"(?=\w)/"),
            array($dq_open.$sq_open, $sq_open.$dq_open), $_);

        # Special case for decade abbreviations (the '80s):
        $_ = preg_replace("/'(?=\\d{2}s)/", $sq_close, $_);

        $close_class = '[^\ \t\r\n\[\{\(\-]';
        # Decimal entities for en- and em-dash, as produced by the dash steps.
        $dec_dashes = '&\#8211;|&\#8212;';

        # Get most opening single quotes:
        $_ = preg_replace("{
            (
                \\s          |   # a whitespace char, or
                &nbsp;      |   # a non-breaking space entity, or
                --          |   # dashes, or
                &[mn]dash;  |   # named dash entities
                $dec_dashes |   # or decimal entities
                &\\#x201[34];    # or hex
            )
            '                   # the quote
            (?=\\w)              # followed by a word character
            }x", '\1'.$sq_open, $_);

        # Single closing quotes:
        $_ = preg_replace("{
            ($close_class)?
            '
            (?(1)|          # If $1 captured, then do nothing;
              (?=\\s | s\\b)  # otherwise, positive lookahead for a whitespace
            )               # char or an 's' at a word ending position. This
                            # is a special case to handle something like:
                            # \"<i>Custer</i>'s Last Stand.\"
            }xi", '\1'.$sq_close, $_);

        # Any remaining single quotes should be opening ones:
        $_ = str_replace("'", $sq_open, $_);

        # Get most opening double quotes:
        $_ = preg_replace("{
            (
                \\s          |   # a whitespace char, or
                &nbsp;      |   # a non-breaking space entity, or
                --          |   # dashes, or
                &[mn]dash;  |   # named dash entities
                $dec_dashes |   # or decimal entities
                &\\#x201[34];    # or hex
            )
            \"                   # the quote
            (?=\\w)              # followed by a word character
            }x", '\1'.$dq_open, $_);

        # Double closing quotes:
        $_ = preg_replace("{
            ($close_class)?
            \"
            (?(1)|(?=\\s))   # If $1 captured, then do nothing;
                            # if not, then make sure the next char is whitespace.
            }x", '\1'.$dq_close, $_);

        # Any remaining quotes should be opening ones.
        $_ = str_replace('"', $dq_open, $_);

        return $_;
    }

    protected function educateBackticks($_) {
        #
        # Parameter:  String.
        # Returns:    The string, with ``backticks'' -style double quotes
        #             translated into HTML curly quote entities.
        #
        # Example input:  ``Isn't this fun?''
        # Example output: &#8220;Isn't this fun?&#8221;
        #
        $_ = str_replace(array("``", "''",),
                         array($this->backtick_doublequote_open,
                               $this->backtick_doublequote_close), $_);
        return $_;
    }

    protected function educateSingleBackticks($_) {
        #
        # Parameter:  String.
        # Returns:    The string, with `backticks' -style single quotes
        #             translated into HTML curly quote entities.
        #
        # Example input:  `Isn't this fun?'
        # Example output: &#8216;Isn&#8217;t this fun?&#8217;
        #
        $_ = str_replace(array("`", "'",),
                         array($this->backtick_singlequote_open,
                               $this->backtick_singlequote_close), $_);
        return $_;
    }

    protected function educateDashes($_) {
        #
        # Parameter:  String.
        #
        # Returns:    The string, with each instance of "--" translated to
        #             an em-dash HTML entity.
        #
        $_ = str_replace('--', $this->em_dash, $_);
        return $_;
    }

    protected function educateDashesOldSchool($_) {
        #
        # Parameter:  String.
        #
        # Returns:    The string, with each instance of "--" translated to
        #             an en-dash HTML entity, and each "---" translated to
        #             an em-dash HTML entity.
        #
        # "---" must be replaced first, otherwise "--" would eat part of it.
        #                      em     en
        $_ = str_replace(array("---", "--",),
                         array($this->em_dash, $this->en_dash), $_);
        return $_;
    }

    protected function educateDashesOldSchoolInverted($_) {
        #
        # Parameter:  String.
        #
        # Returns:    The string, with each instance of "--" translated to
        #             an em-dash HTML entity, and each "---" translated to
        #             an en-dash HTML entity. Two reasons why: First, unlike the
        #             en- and em-dash syntax supported by
        #             EducateDashesOldSchool(), it's compatible with existing
        #             entries written before SmartyPants 1.1, back when "--" was
        #             only used for em-dashes.  Second, em-dashes are more
        #             common than en-dashes, and so it sort of makes sense that
        #             the shortcut should be shorter to type. (Thanks to Aaron
        #             Swartz for the idea.)
        #
        # "---" must be replaced first, otherwise "--" would eat part of it.
        #                      en     em
        $_ = str_replace(array("---", "--",),
                         array($this->en_dash, $this->em_dash), $_);
        return $_;
    }

    protected function educateEllipses($_) {
        #
        # Parameter:  String.
        # Returns:    The string, with each instance of "..." translated to
        #             an ellipsis HTML entity. Also converts the case where
        #             there are spaces between the dots.
        #
        # Example input:  Huh...?
        # Example output: Huh&#8230;?
        #
        $_ = str_replace(array("...", ". . .",), $this->ellipsis, $_);
        return $_;
    }

    protected function stupefyEntities($_) {
        #
        # Parameter:  String.
        # Returns:    The string, with each SmartyPants HTML entity translated to
        #             its ASCII counterpart.
        #
        # Example input:  &#8220;Hello &#8212; world.&#8221;
        # Example output: "Hello -- world."
        #
        # Only the fixed decimal entities below are recognised; customised
        # values of the configuration variables are not consulted.
        #
        #                      en-dash    em-dash
        $_ = str_replace(array('&#8211;', '&#8212;'),
                         array('-',       '--'), $_);

        # single quote         open       close
        $_ = str_replace(array('&#8216;', '&#8217;'), "'", $_);

        # double quote         open       close
        $_ = str_replace(array('&#8220;', '&#8221;'), '"', $_);

        $_ = str_replace('&#8230;', '...', $_); # ellipsis

        return $_;
    }

    protected function processEscapes($_) {
        #
        # Parameter:  String.
        # Returns:    The string, with after processing the following backslash
        #             escape sequences. This is useful if you want to force a "dumb"
        #             quote or other character to appear.
        #
        #             Escape  Value
        #             ------  -----
        #             \\      &#92;
        #             \"      &#34;
        #             \'      &#39;
        #             \.      &#46;
        #             \-      &#45;
        #             \`      &#96;
        #
        # The values are numeric entities on purpose: a literal character
        # would be picked up again by the educate* steps that run afterwards.
        #
        $_ = str_replace(
            array('\\\\',  '\"',    "\'",    '\.',    '\-',    '\`'),
            array('&#92;', '&#34;', '&#39;', '&#46;', '&#45;', '&#96;'), $_);

        return $_;
    }

    protected function tokenizeHTML($str) {
        #
        # Parameter:  String containing HTML markup.
        # Returns:    An array of the tokens comprising the input
        #             string. Each token is either a tag (possibly with nested,
        #             tags contained therein, such as <a href="<MTFoo>">, or a
        #             run of text between tags. Each element of the array is a
        #             two-element array; the first is either 'tag' or 'text';
        #             the second is the actual value. Empty text runs (between
        #             adjacent tags, or at either end) produce no token.
        #
        #
        # Regular expression derived from the _tokenize() subroutine in
        # Brad Choate's MTRegex plugin.
        # <http://www.bradchoate.com/past/mtregex.php>
        #
        $tokens = array();

        $match = '(?s:<!--.*?-->)|'.    # comment
                 '(?s:<\?.*?\?>)|'.     # processing instruction
                                        # regular tags
                 '(?:<[/!$]?[-a-zA-Z0-9:]+\b(?>[^"\'>]+|"[^"]*"|\'[^\']*\')*>)';

        $parts = preg_split("{($match)}", $str, -1, PREG_SPLIT_DELIM_CAPTURE);

        if ($parts === false) {
            # The regex engine gave up (e.g. backtrack limit). Hand the input
            # back as a single 'tag' token so that it passes through
            # unmodified instead of being dropped.
            return array(array('tag', $str));
        }

        # With a single capturing group the result alternates: even offsets
        # are the text between matches, odd offsets are the matched tags.
        foreach ($parts as $offset => $part) {
            if ($offset % 2) {
                $tokens[] = array('tag', $part);
            } else if ($part !== '') {
                $tokens[] = array('text', $part);
            }
        }
        return $tokens;
    }

}
|