conversionfunctions.js, Increase number of invisible chars detected when converting to HTML/XML
This commit is contained in:
parent
3745eae9f9
commit
a3ef25cd17
|
@ -903,6 +903,142 @@ function convertCharStr2XML ( str, parameters ) {
|
|||
str = str.replace(/</g, '&lt;')
|
||||
str = str.replace(/>/g, '&gt;')
|
||||
|
||||
// replace invisible and ambiguous characters
|
||||
var nongraphic = new Set([
|
||||
'\u00AD', // shy
|
||||
|
||||
'\u061C', // alm
|
||||
'\u070F', // sam
|
||||
'\u08E2', // end of ayah
|
||||
'\u180E', // mvs
|
||||
'\u200B', // zwsp
|
||||
'\u200C', // zwnj
|
||||
'\u200D', // zwj
|
||||
'\u200E', // lrm
|
||||
'\u200F', // rlm
|
||||
'\u202A', // lre
|
||||
'\u202B', // rle
|
||||
'\u202D', // lro
|
||||
'\u202E', // rlo
|
||||
'\u202C', // pdf
|
||||
'\u2060', // wjoiner
|
||||
'\u2061', // func appln
|
||||
'\u2062', // inv x
|
||||
'\u2063', // inv sep
|
||||
'\u2064', // inv +
|
||||
'\u2066', // lri
|
||||
'\u2067', // rli
|
||||
'\u2068', // fsi
|
||||
'\u2069', // pdi
|
||||
'\u206A', // iss
|
||||
'\u206B', // ass
|
||||
'\u206C', // iafs
|
||||
'\u206D', // aafs
|
||||
'\u206E', // nads
|
||||
'\u206F', // nods
|
||||
'\uFFF9', // iaa
|
||||
'\uFFFA', // ias
|
||||
'\uFFFB', // iat
|
||||
|
||||
'\u13430', // vert join
|
||||
'\u13431', // horiz join
|
||||
'\u13432', // ins top start
|
||||
'\u13433', // ins bottom start
|
||||
'\u13434', // ins top end
|
||||
'\u13435', // ins bottom end
|
||||
'\u13436', // overlay mid
|
||||
'\u13437', // beg seg
|
||||
'\u13438', // end seg
|
||||
|
||||
'\u1BCA0', // sh let overlap
|
||||
'\u1BCA1', // sh cont overlap
|
||||
'\u1BCA2', // sh format down
|
||||
'\u1BCA3', // sh format up
|
||||
|
||||
'\u1D173', // mus beg beam
|
||||
'\u1D174', // mus end beam
|
||||
'\u1D175', // mus beg tie
|
||||
'\u1D176', // mus end tie
|
||||
'\u1D177', // mus beg slur
|
||||
'\u1D178', // mus end slur
|
||||
'\u1D179', // mus beg phrase
|
||||
'\u1D17A', // mus end phrase
|
||||
|
||||
'\u2000', // en quad
|
||||
'\u2001', // em quad
|
||||
'\u2002', // en space
|
||||
'\u2003', // em space
|
||||
'\u2004', // 3 per em space
|
||||
'\u2005', // 4 per em space
|
||||
'\u2006', // 6 per em space
|
||||
'\u2007', // figure space
|
||||
'\u2008', // punctuation space
|
||||
'\u2009', // thin space
|
||||
'\u200A', // hair space
|
||||
'\u205F', // mmsp
|
||||
'\u00A0', // nbsp
|
||||
'\u3000', // ideographic sp
|
||||
'\u202F', // nnbsp
|
||||
|
||||
'\u180B', // mfvs1
|
||||
'\u180C', // mfvs2
|
||||
'\u180D', // mfvs3
|
||||
|
||||
'\u2028', // line sep
|
||||
|
||||
'\u0000', // null
|
||||
])
|
||||
|
||||
if (parameters.match(/convertinvisibles/)) {
|
||||
newstring = ''
|
||||
for (let i=0;i<str.length;i++) {
|
||||
if (str.codePointAt(i)===0x09 || str.codePointAt(i)===0x0A || str.codePointAt(i)===0x0D) newstring += str[i]
|
||||
else if (str.codePointAt(i)<32 || (str.codePointAt(i)>126 && str.codePointAt(i)<160) || str.codePointAt(i)>0xE0000) {
|
||||
hex = str.codePointAt(i).toString(16).toUpperCase()
|
||||
while (hex.length < 4) hex = '0'+hex
|
||||
newstring += '&#x'+hex+';'
|
||||
}
|
||||
else if (nongraphic.has(str[i])) {
|
||||
hex = str.codePointAt(i).toString(16).toUpperCase()
|
||||
while (hex.length < 4) hex = '0'+hex
|
||||
newstring += '&#x'+hex+';'
|
||||
}
|
||||
else newstring += str[i]
|
||||
}
|
||||
str = newstring
|
||||
}
|
||||
|
||||
// convert lre/rle/pdf/rli/lri/fsi/pdi to markup
|
||||
if (parameters.match(/bidimarkup/)) {
|
||||
str = str.replace(/\u2066|⁦/g, '<span dir="ltr">') // lri
|
||||
str = str.replace(/\u2067|⁧/g, '<span dir="rtl">') // rli
|
||||
str = str.replace(/\u2068|⁨/g, '<span dir="auto">') // fsi
|
||||
str = str.replace(/\u2069|⁩/g, '</span>') // pdi
|
||||
|
||||
str = str.replace(/\u202A|‪/g, '<span dir="ltr">') // lre
|
||||
str = str.replace(/\u202B|‫/g, '<span dir="rtl">') // rle
|
||||
str = str.replace(/\u202C|‬/g, '</span>') // pdf
|
||||
|
||||
//str = str.replace(/\u202D/g, '<bdo dir="ltr">')
|
||||
//str = str.replace(/\u202E/g, '<bdo dir="rtl">')
|
||||
}
|
||||
|
||||
return str;
|
||||
}
|
||||
|
||||
|
||||
function convertCharStr2XMLOLD ( str, parameters ) {
|
||||
// replaces xml/html syntax-sensitive characters in a string with entities
|
||||
// also replaces invisible and ambiguous characters with escapes (list to be extended)
|
||||
// str: string, the input string
|
||||
// parameters: string, list of enum[convertinvisibles, bidimarkup]
|
||||
// (convertinvisibles) invisible characters are converted to NCRs
|
||||
// (bidimarkup) bidi rle/lre/pdf/rli/lri/fsi/pdi characters are converted to markup
|
||||
str = str.replace(/&/g, '&amp;')
|
||||
str = str.replace(/"/g, '&quot;')
|
||||
str = str.replace(/</g, '&lt;')
|
||||
str = str.replace(/>/g, '&gt;')
|
||||
|
||||
// replace invisible and ambiguous characters
|
||||
if (parameters.match(/convertinvisibles/)) {
|
||||
str = str.replace(/\u2066/g, '&#x2066;') // lri
|
||||
|
@ -931,7 +1067,7 @@ function convertCharStr2XML ( str, parameters ) {
|
|||
str = str.replace(/\u200A/g, '&#x200A;') // hair space
|
||||
str = str.replace(/\u200B/g, '&#x200B;') // zwsp
|
||||
str = str.replace(/\u205F/g, '&#x205F;') // mmsp
|
||||
str = str.replace(/\uA0/g, ' ') // nbsp
|
||||
str = str.replace(/\u00A0/g, ' ') // nbsp
|
||||
str = str.replace(/\u3000/g, '&#x3000;') // ideographic sp
|
||||
str = str.replace(/\u202F/g, '&#x202F;') // nnbsp
|
||||
|
||||
|
@ -942,7 +1078,11 @@ function convertCharStr2XML ( str, parameters ) {
|
|||
str = str.replace(/\u200C/g, '&#x200C;') // zwnj
|
||||
str = str.replace(/\u200D/g, '&#x200D;') // zwj
|
||||
str = str.replace(/\u2028/g, '&#x2028;') // line sep
|
||||
str = str.replace(/\u206A/g, '&#x206A;') // iss
|
||||
str = str.replace(/\u00AD/g, '­') // shy
|
||||
str = str.replace(/\u2060/g, '&#x2060;') // wjoiner
|
||||
|
||||
str = str.replace(/\u0000/g, '�') // null
|
||||
str = str.replace(/\u206A/g, '&#x206A;') // iss
|
||||
str = str.replace(/\u206B/g, '&#x206B;') // ass
|
||||
str = str.replace(/\u206C/g, '&#x206C;') // iafs
|
||||
str = str.replace(/\u206D/g, '&#x206D;') // aafs
|
||||
|
|
Loading…
Reference in New Issue