module Hailo::Parse
Direct including types
Defined in:
hailo/parse.cr
Constant Summary
ABBREV = /#{ALPHABET}(?:\.#{ALPHABET})++\./
ADDRESS = /:/
ALPHABET = /(?:[\p{L}\p{M}])/
APOST_WORD = /#{ALPHABET}++(?:#{APOSTROPHE}#{ALPHABET}++)++/
APOSTROPHE = /['’´]/
BARE_WORD = /#{WORD_CHAR}++/
BOUNDARY = /#{CLOSE_QUOTE}?(?:\s*#{TERMINATOR}|#{ADDRESS})\s+#{OPEN_QUOTE}?\s*/
CLOSE_QUOTE = /['"’“”«»」』›‘]/
CLOSE_TAG = /<\/(?:-|#{WORD_CHAR})+>/
CURRENCY = /[¤¥¢£\$]/
DASH = /[–-]/
DATE = /[0-9]{4}-[Ww]?[0-9]{1,2}-[0-9]{1,2}/
DATETIME = /#{DATE}[Tt]#{TIME}/
DOTTED = /#{BARE_WORD}?\.#{BARE_WORD}(?:\.#{BARE_WORD})*+/
DOTTED_STRICT = /#{LOOSE_WORD}(?:#{POINT}(?:\d+|#{WORD_CHAR}{2,}))?/
ELLIPSIS = /(?:\.{2,}|…)/
EMAIL = /[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+(?:\.[A-Za-z]{2,4})*/
ESC_SPACE = /(?:\\ )+/
FILENAME = /(?:#{NAME})?\.#{NAME}(?:\.#{NAME})*|#{NAME}/
HOST_WORD = /#{BARE_WORD}(?:-+#{BARE_WORD})*/
HOSTNAME = /#{HOST_WORD}(?:\.#{HOST_WORD})*/
IPV4 = /[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}/
IPV6 = /(?:(?:[0-9A-Fa-f]{0,4})?:){1,7}[0-9A-Fa-f]{0,4}/
IRC_CHAN = /[#&+][^ \a\0\012\015,:]{1,199}/
IRC_NICK = /<(?: |[&~]?[@%+~&])?[A-Za-z_`\-^\|\\\{\}\[\]][A-Za-z_0-9`\-^\|\\\{}\[\]]+>/
LOOSE_WORD = /#{IRC_CHAN}|#{DATETIME}|#{DATE}|#{TIME}|#{PATH}|#{NUMBER}|#{ABBREV}|#{APOST_WORD}|#{NUMERO}|#{BARE_WORD}(?:#{DASH}(?:#{WORD_TYPES}|#{BARE_WORD})|#{APOSTROPHE}(?!#{ALPHABET}|#{NUMBER}|#{APOSTROPHE})|#{DASH}(?!#{DASH}{2}))*/
NAME = /(?:#{BARE_WORD}|#{ESC_SPACE})+/
NON_WORD = /#{NWORD_CHAR}++/
NONSPACE = /\S/
NUMBER = /#{CURRENCY}?+#{POINT}\d++(?:#{POINT}\d++)*+(?:#{CURRENCY}|#{ALPHABET}++)?+|#{CURRENCY}?+\d++(?:#{POINT}\d++)*+(?:#{CURRENCY}|#{ALPHABET}++)?+(?!\d|#{ALPHABET})/
NUMERO = /#[0-9]+/
NWORD_CHAR = /[^\s\p{L}\p{M}\d_]/
OPEN_QUOTE = /['"‘“„«»「『‹‚]/
OPT_LONG = /--#{OPT_PART}(?:-#{OPT_PART})*/
OPT_PART = /(?:#{ALPHABET}|\d)(?:#{ALPHABET}|[_\d])+/
OPT_SHORT = /-(?:#{ALPHABET}|\d)(?!#{ALPHABET}|\d)/
PAREN_TIME = /\(#{TIME}\)/
PATH = /#{UNIX_PATH}|#{WIN_PATH}/
PERL_CLASS = /(?:::\w+(?:::\w+)*|\w+(?:::\w+)+)(?:::)?|\w+::/
POINT = /[.,]/
PORT = /:[0-9]+/
PUNCTUATION = /[?!‽,;.:]/
RX_APOST = {"'" => /[’´](?!#{ALPHABET}|#{NUMBER})/, "’" => /['´](?!#{ALPHABET}|#{NUMBER})/, "´" => /['’](?!#{ALPHABET}|#{NUMBER})/}
RX_APOSTROPHE = /#{APOSTROPHE}/
RX_CAPITALIZE_FIRST = /^\s*#{OPEN_QUOTE}?\s*\K#{SPLIT_WORD}(?=#{ELLIPSIS}|(?:(?:#{CLOSE_QUOTE}|#{TERMINATOR}|#{ADDRESS}|#{PUNCTUATION}+)?(?:\s|$)))/
RX_CAPITALIZE_IM = /(?:(?:#{ELLIPSIS}|\s+)|#{OPEN_QUOTE})\Ki(?=#{APOSTROPHE}#{ALPHABET}|\s|#{PUNCTUATION}|$)/
RX_CAPITALIZE_REST_A = /(?:#{ELLIPSIS}|\s+)#{OPEN_QUOTE}?\s*#{WORD_STRICT}#{BOUNDARY}\K#{SPLIT_WORD}/
RX_CAPITALIZE_REST_B = /#{SEPARATOR}#{WORD_STRICT}#{SEPARATOR}#{BOUNDARY}\K#{SPLIT_WORD}/
RX_CAPITALIZE_SECOND = /^#{SPLIT_WORD}(?:\s*#{TERMINATOR}|#{ADDRESS})\s+\K#{SPLIT_WORD}/
RX_DASH_NEWL = /(#{DASH})\s*\n+\s*/
RX_END_PARAGRAPH = /(?:#{ELLIPSIS}|\s+|^)#{OPEN_QUOTE}?(?:#{SPLIT_WORD}(?:\.#{SPLIT_WORD})*)\K(#{CLOSE_QUOTE}?)$/
RX_MIXED_CASE = /\p{Ll}+\p{Lu}|\p{Lu}{2,}\p{Ll}|(?:\p{Lu}+#{NWORD_CHAR}+)(?<!I')(?:\p{Lu}*\p{Ll})/
RX_NEWLINE = /\s*\n+\s*/
RX_NON_SPACE = /\S+/
RX_SPACE = /\s+/
RX_TOKEN_NORMAL = /(?P<word>#{WORD})|(?P<non_word>#{NON_WORD})/
RX_TOKEN_SPECIAL = /(?P<special>#{SPECIAL_WORD})|#{RX_TOKEN_NORMAL}/
RX_WORD_APOST = /#{APOSTROPHE}(?!#{ALPHABET}|#{NUMBER})/
SEPARATOR = "\b"
SPACE = /\s/
SPECIAL_WORD = /(?>#{URI}|#{OPT_SHORT}|#{OPT_LONG}|#{CLOSE_TAG}|#{IRC_NICK}|#{IRC_CHAN}|#{DATETIME}|#{DATE}|#{TIME}|#{PAREN_TIME}|#{SQUARE_TIME}|#{PERL_CLASS}|#{EMAIL}|#{TWAT_NAME}|#{PATH}|#{NUMERO})/
SPLIT_WORD = /#{LOOSE_WORD}(?:\/#{LOOSE_WORD})?(?=#{PUNCTUATION}(?:\s+|$)|#{CLOSE_QUOTE}|#{TERMINATOR}|\s+|$)/
SQUARE_TIME = /\[#{TIME}\]/
TERMINATOR = /[?!‽]+|(?<!\.)\./
TIME = /[0-9]{1,2}:[0-9]{2}(?::[0-9]{2})?(?:[Zz]| ?(?:am|AM|pm|PM)|[-+±][0-9]{2}(?::?[0-9]{2})?)?/
TWAT_NAME = /@[A-Za-z0-9_]+/
UNIX_PATH = /\/#{FILENAME}(?:\/#{FILENAME})*\/?/
URI = /#{URI_SCHEME}(?:#{HOSTNAME}|#{IPV4}|#{IPV6})#{PORT}?#{URI_PATH}?/
URI_PATH = /\/(?:(?:#{ALPHABET}+|#{URI_EXTRA1}+)*(?:#{ALPHABET}+|#{URI_EXTRA2}+))?/
URI_SCHEME = /(?:#{HOST_WORD}\+)?#{BARE_WORD}:\/\//
WIN_PATH = /#{ALPHABET}:\\#{FILENAME}(?:\\#{FILENAME})*\\?/
WORD = /#{WORD_TYPES}(?:(?:#{DASH}#{WORD_TYPES})++|#{DASH}(?!#{DASH}))?+/
WORD_CHAR = /#{ALPHABET}|[\d_]/
WORD_STRICT = /#{DOTTED_STRICT}(?:#{APOSTROPHE}#{DOTTED_STRICT})*/
WORD_TYPES = /#{NUMBER}|#{ABBREV}|#{DOTTED}|#{APOST_WORD}|#{BARE_WORD}/