[RULES]
# Each entry has the form NAME=REGEX (Unicode property classes such as \p{L}
# for letters and \p{N} for numbers are used throughout).
# NOTE(review): the rules are presumably tried from top to bottom, so the
# specific patterns precede the catch-alls - confirm before reordering.

# URL with an explicit scheme (scheme matched case-insensitively), followed by
# letter/digit runs joined by one or more punctuation characters.
URL=(?i:https?|ftps?|nfs|sshfs|gopher|smb)://[\p{L}\p{N}]+(?:[[:punct:]]+[\p{L}\p{N}]+)+
# Scheme-less address starting with "www." (lower case only).
URL-WWW=www\.[\p{L}\p{N}]+(?:[[:punct:]]+[\p{L}\p{N}]+)+

# E-mail address. The pattern is anchored at both ends, and the last label
# after the final period is limited to 2-4 letters.
E-MAIL=^[\p{L}\p{N}\._%+\-]+@[\p{L}\p{N}\.\-]+\.\p{L}{2,4}$



# Word with a parenthesised prefix; a dash or connector may follow the prefix
# inside or outside the bracket. Ex: (dis)information
WORD-PARPREFIX=(?:\p{Ps}\p{L}+[\p{Pc}\p{Pd}]?\p{Pe}[\p{Pc}\p{Pd}]?)\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*

# Word with a parenthesised suffix. Ex: understand(s)
WORD-PARSUFFIX=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*(?:[\p{Pc}\p{Pd}]?\p{Ps}[\p{Pc}\p{Pd}]?\p{L}+\p{Pe})

# Keep parts connected by a dash or connector punctuation (e.g. underscore)
# together (original note: even if they are in parentheses).
WORD-COMPOUND=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)+

# Abbreviations with multiple periods: groups of 1-3 letters separated by
# periods, with an optional trailing period.
ABBREVIATION=\p{L}{1,3}(?:\.\p{L}{1,3})+\.?

# Retain initials: exactly one uppercase or titlecase letter followed by a
# period (anchored at both ends).
INITIAL=^(?:\p{Lt}|\p{Lu})\.$

# Runs of two or more of the characters . - ! ? (ellipsis etc.). The pattern
# does not require the characters in a run to be identical.
PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,}

# Dates written with dashes: day/month first with a 2-4 digit last field, or
# a four-digit first field (DATE-REVERSE).
DATE=\p{N}{1,2}-\p{N}{1,2}-\p{N}{2,4}
DATE-REVERSE=\p{N}{4}-\p{N}{1,2}-\p{N}{1,2}

# Abbreviated year: an apostrophe plus two digits (captured), followed by a
# non-digit or the end of the input.
NUMBER-YEAR=('\p{N}{2})(?:\P{N}|\z)
# Disabled earlier form, which always required a following non-digit:
#NUMBER-YEAR=('\p{N}{2})\P{N}
# Clock times: hours and minutes of one or two digits, optional seconds of one
# or two digits, and an optional am/pm marker (case-insensitive).
# Ex: 9:30, 21:05:59, 10:15pm
TIME=\p{N}{1,2}:\p{N}{1,2}(?::\p{N}{1,2})?(?i:am|pm)?

# Retain digits, including numbers that start with a period or comma (.22),
# digit groups joined by periods or commas, and negative numbers.
NUMBER=-?(?:[\.,]?\p{N}+)+

# A single currency symbol (Unicode category Sc).
CURRENCY=\p{Sc}

# A run of letters and non-spacing combining marks.
WORD=[\p{L}\p{Mn}]+

# A single punctuation character.
PUNCTUATION=\p{P}

# Any single character.
# NOTE(review): presumably the last-resort fallback - confirm rule ordering.
UNKNOWN=.

[PREFIXES]
# One pattern per line: a short letter sequence followed by an apostrophe or
# a backtick (the character class ['`]).
# NOTE(review): these look like Italian elided articles, prepositions and
# similar forms (l', dell', un', ...), presumably split off from the word that
# follows - confirm against the consuming tokenizer.
l['`]
d['`]
m['`]
t['`]
s['`]
c['`]
dell['`]
dall['`]
nell['`]
all['`]
sull['`]
quell['`]
quest['`]
e['`]
un['`]
senz['`]
com['`]
cos['`]
anch['`]
dev['`]

[SUFFIXES]
# No entries are defined in this section.

[ORDINALS]
# One entry per line.
# NOTE(review): presumably the masculine/feminine ordinal endings that may
# follow a number (e.g. 1o, 2a) - confirm.
o
a

[TOKENS]
# One pattern per line: a short letter sequence followed by an apostrophe or
# a backtick (the character class ['`]).
# This list is identical, entry for entry, to the one in [PREFIXES]; keep the
# two in sync when editing.
# NOTE(review): confirm whether both lists are really needed.
l['`]
d['`]
m['`]
t['`]
s['`]
c['`]
dell['`]
dall['`]
nell['`]
all['`]
sull['`]
quell['`]
quest['`]
e['`]
un['`]
senz['`]
com['`]
cos['`]
anch['`]
dev['`]

[UNITS]
# Unit abbreviations, one per line.
# NOTE(review): presumably length (km, m, cm, mm), mass (g, kg), temperature
# (C), volume (l), time (s, sec, min) and data size (gb, mb, kb); whether
# matching is case-sensitive is not visible here - confirm.
km
m
cm
mm
g
kg
C
l
s
sec
min
gb
mb
kb


[CURRENCY]
# Currency codes written as letters, one per line (currency symbols are
# covered by the CURRENCY rule in [RULES]).
EUR

[ABBREVIATIONS]
# No entries are defined in this section.


[EOSMARKERS]
# End-of-sentence marker characters, one per entry.
# Each entry is a \uXXXX escape, where XXXX is the four-digit HEXADECIMAL code
# point (the value shown in parentheses on the "Code" line), not the decimal
# one. For example "!" is decimal 33 = 0x21 and is written \u0021.

# Character: !
# Name: EXCLAMATION MARK
# Code: 33 (0x21)
\u0021

# Character: ?
# Name: QUESTION MARK
# Code: 63 (0x3f)
\u003F

# Character: ;
# Name: GREEK QUESTION MARK
# Code: 894 (0x37e)
\u037E

# Character: ؟
# Name: ARABIC QUESTION MARK
# Code: 1567 (0x61f)
\u061F

# Character: 。
# Name: IDEOGRAPHIC FULL STOP
# Code: 12290 (0x3002)
\u3002

# Character: ｡
# Name: HALFWIDTH IDEOGRAPHIC FULL STOP
# Code: 65377 (0xff61)
\uFF61

# Character: ？
# Name: FULLWIDTH QUESTION MARK
# Code: 65311 (0xff1f)
\uFF1F

# Character: ！
# Name: FULLWIDTH EXCLAMATION MARK
# Code: 65281 (0xff01)
\uFF01

# Character: ।
# Name: DEVANAGARI DANDA
# Code: 2404 (0x964)
\u0964

# Character: ։
# Name: ARMENIAN FULL STOP
# Code: 1417 (0x589)
\u0589

# Character: ՞
# Name: ARMENIAN QUESTION MARK
# Code: 1374 (0x55e)
\u055E

# Character: ።
# Name: ETHIOPIC FULL STOP
# Code: 4962 (0x1362)
\u1362

# Character: ᙮
# Name: CANADIAN SYLLABICS FULL STOP
# Code: 5742 (0x166e)
\u166E

# Character: ។
# Name: KHMER SIGN KHAN
# Code: 6100 (0x17d4)
\u17D4

# Character: ៕
# Name: KHMER SIGN BARIYOOSAN
# Code: 6101 (0x17d5)
\u17D5

# Character: ᠃
# Name: MONGOLIAN FULL STOP
# Code: 6147 (0x1803)
\u1803

# Character: ᠉
# Name: MONGOLIAN MANCHU FULL STOP
# Code: 6153 (0x1809)
\u1809
