Classes | Macros | Enumerations | Functions
ENC: Codeset and header field handling

The functions in this group should be conformant to the following standards: ANSI X3.4, ISO 2022, ISO 8601, ISO 8859, ISO 10646, RFC 1468, RFC 2045, RFC 2046, RFC 2047, RFC 2049, RFC 2152, RFC 2183, RFC 2231, RFC 2646, RFC 3629, RFC 3676, RFC 5198, RFC 5536, RFC 6657, POSIX.1-1996, Unicode 14.0.0. More...

Classes

struct  enc_mime_ct
 MIME content type information. More...
 
struct  enc_mime_mpe
 Locations of MIME multipart entities. More...
 
struct  enc_wm_pattern
 Wildmat array element (for RFC 3977 wildmat-pattern) More...
 

Macros

#define MAIN_ERR_PREFIX   "ENC: "
 Message prefix for ENCODING module.
 
#define ENC_UC_NORM_DEBUG   0
 
#define ENC_MIME_PARA_LENGTH_MAX   (size_t) 127
 Maximum length of MIME parameter attribute tokens.
 
#define ENC_MIME_HEADER_FOLD_ASCII_LINES   1
 MIME word encoder folding behaviour. More...
 
#define ENC_UA   "\xEF\xBF\xBD" /* U+FFFD */
 
#define ENC_RC   0xFFFDL /* U+FFFD */
 
#define ENC_UC_DECOMPOSITION_BUFSIZE   (size_t) 16
 
#define ENC_HDR_BUFSIZE   (size_t) 998
 
#define ENC_FMT_BUFLEN   (size_t) 7
 
#define ENC_CTE_BUFLEN   (size_t) 32
 Buffer size for content transfer encoding name strings.
 
#define ENC_CS_BUFLEN   (size_t) 32
 Buffer size for character set name strings.
 
#define ENC_BO_BUFLEN   (size_t) 75
 Buffer size for multipart boundary strings. More...
 
#define ENC_DELIMITER
 Delimiter string to print between article header and body parts. More...
 

Enumerations

enum  iso2022_state { ISO2022_ASCII, ISO2022_ISO646, ISO2022_JIS_X_0208 }
 
enum  uc_hs_type {
  UC_HST_NONE, UC_HST_L, UC_HST_V, UC_HST_T,
  UC_HST_LV, UC_HST_LVT
}
 
enum  enc_mime_ct_type {
  ENC_CT_UNKNOWN, ENC_CT_TEXT, ENC_CT_IMAGE, ENC_CT_AUDIO,
  ENC_CT_VIDEO, ENC_CT_MULTIPART, ENC_CT_MESSAGE, ENC_CT_APPLICATION
}
 IDs for supported MIME content types. More...
 
enum  enc_mime_ct_subtype {
  ENC_CTS_UNKNOWN, ENC_CTS_PLAIN, ENC_CTS_MIXED, ENC_CTS_ALTERNATIVE,
  ENC_CTS_DIGEST, ENC_CTS_RFC822, ENC_CTS_OCTETSTREAM
}
 IDs for supported MIME content subtypes. More...
 
enum  enc_mime_cte {
  ENC_CTE_UNKNOWN, ENC_CTE_7BIT, ENC_CTE_8BIT, ENC_CTE_BIN,
  ENC_CTE_Q, ENC_CTE_B
}
 IDs for supported MIME content transfer encodings. More...
 
enum  enc_mime_cs {
  ENC_CS_UNKNOWN, ENC_CS_ASCII, ENC_CS_ISO8859_1, ENC_CS_ISO8859_2,
  ENC_CS_ISO8859_3, ENC_CS_ISO8859_4, ENC_CS_ISO8859_5, ENC_CS_ISO8859_6,
  ENC_CS_ISO8859_7, ENC_CS_ISO8859_8, ENC_CS_ISO8859_9, ENC_CS_ISO8859_10,
  ENC_CS_ISO8859_11, ENC_CS_ISO8859_13, ENC_CS_ISO8859_14, ENC_CS_ISO8859_15,
  ENC_CS_ISO8859_16, ENC_CS_ISO8859_X, ENC_CS_MACINTOSH, ENC_CS_KOI8R,
  ENC_CS_KOI8U, ENC_CS_WINDOWS_1250, ENC_CS_WINDOWS_1251, ENC_CS_WINDOWS_1252,
  ENC_CS_WINDOWS_1253, ENC_CS_WINDOWS_1254, ENC_CS_WINDOWS_1255, ENC_CS_WINDOWS_1256,
  ENC_CS_WINDOWS_1257, ENC_CS_WINDOWS_1258, ENC_CS_IBM437, ENC_CS_IBM775,
  ENC_CS_IBM850, ENC_CS_IBM852, ENC_CS_IBM858, ENC_CS_ISO2022_JP,
  ENC_CS_CESU_8, ENC_CS_UTF_7, ENC_CS_UTF_8, ENC_CS_UTF_16BE
}
 IDs for supported MIME character sets. More...
 
enum  enc_mime_cd { ENC_CD_UNKNOWN, ENC_CD_INLINE, ENC_CD_ATTACHMENT }
 IDs for supported MIME content disposition.
 
enum  enc_uri_scheme {
  ENC_URI_SCHEME_INVALID, ENC_URI_SCHEME_HTTP, ENC_URI_SCHEME_FTP, ENC_URI_SCHEME_NEWS,
  ENC_URI_SCHEME_MAILTO
}
 URI schemes. More...
 

Functions

void enc_uc_encode_utf8 (char *buf, size_t *i, long int *dbuf, size_t *di)
 Encode Unicode codepoints to UTF-8. More...
 
const char * enc_create_name_addr (const char *data, size_t offset)
 Create a "name-addr" construct according to RFC 5322. More...
 
unsigned long int enc_lines_decode (const char *lines)
 Decode number of lines. More...
 
void enc_convert_lines_to_string (char *l, unsigned long int l_raw)
 Convert number of lines to string. More...
 
core_time_t enc_timestamp_decode (const char *timestamp)
 Decode canonical timestamp to POSIX time (seconds since epoch) More...
 
int enc_convert_posix_to_iso8601 (char *isodate, core_time_t pts)
 Convert POSIX timestamp to ISO 8601 conformant local date and time. More...
 
int enc_get_iso8601_utc (char *isodate)
 Get current UTC date in ISO 8601 conformant format. More...
 
int enc_convert_iso8601_to_posix (core_time_t *pts, const char *isodate)
 Convert ISO 8601 conformant UTC date and time to POSIX timestamp. More...
 
int enc_convert_iso8601_to_timestamp (const char **ts, const char *isodate)
 Convert ISO 8601 conformant date to canonical timestamp. More...
 
int enc_convert_anum_to_ascii (char result[17], size_t *len, core_anum_t wm)
 Convert article number from numerical format to ASCII. More...
 
int enc_convert_ascii_to_anum (core_anum_t *result, const char *wm, int len)
 Convert number from ASCII to numerical format. More...
 
int enc_convert_octet_to_hex (char *result, unsigned int octet)
 Convert octet to hexadecimal (ASCII) format. More...
 
void enc_rot13 (char *data)
 Encode or decode data with ROT13 algorithm. More...
 
int enc_mime_encode_base64 (const char **enc, const char *data, size_t len)
 Encode binary data to base64. More...
 
const char * enc_extract_addr_spec (const char *mailbox)
 Extract addr-spec token from RFC 5322 mailbox. More...
 
int enc_ascii_check (const char *s)
 Verify ASCII encoding. More...
 
int enc_ascii_check_alpha (const char *s)
 Check for ASCII alphabetic characters. More...
 
int enc_ascii_check_digit (const char *s)
 Check for ASCII digit characters. More...
 
int enc_ascii_check_printable (const char *s)
 Check for printable ASCII characters. More...
 
void enc_ascii_convert_to_printable (char *s)
 Convert to printable ASCII format. More...
 
void enc_ascii_convert_distribution (char *s)
 Convert body of distribution header field. More...
 
int enc_uc_check_utf8 (const char *s)
 Verify UTF-8 encoding. More...
 
const char * enc_uc_repair_utf8 (const char *s)
 Repair UTF-8 encoding. More...
 
int enc_create_wildmat (struct enc_wm_pattern **obj, const char *wm)
 Create wildmat pattern array. More...
 
void enc_destroy_wildmat (struct enc_wm_pattern **obj, int num)
 Destroy wildmat pattern array. More...
 
const char * enc_convert_canonical_to_posix (const char *s, int rcr, int rlf)
 Convert from canonical (RFC 822) to local (POSIX) form. More...
 
const char * enc_convert_posix_to_canonical (const char *s)
 Convert from local (POSIX) to canonical (RFC 822) form. More...
 
const char * enc_convert_to_utf8_nfc (enum enc_mime_cs charset, const char *s)
 Convert string from supported character set to Unicode (UTF-8 NFC) More...
 
const char * enc_convert_to_8bit (enum enc_mime_cs *charset, const char *s, const char **cs_iana)
 Convert string from Unicode (UTF-8 NFC) to an 8bit character set. More...
 
int enc_mime_word_encode (const char **r, const char *b, size_t pl)
 Encode header field body using MIME encoded-word tokens. More...
 
int enc_mime_word_decode (const char **r, const char *b)
 Decode header field containing potential MIME encoded-word tokens. More...
 
int enc_mime_para_decode (const char **r, const char *b, int m)
 Decode header field containing potential MIME parameters. More...
 
void enc_mime_get_ct (struct enc_mime_ct *ct, const char *hf_body, char *bo)
 Decode MIME "Content-Type" header field. More...
 
enum enc_mime_cte enc_mime_get_cte (const char *hf_body)
 Decode content transfer encoding description. More...
 
void enc_mime_get_cd (const char *hf_body, enum enc_mime_cd *type, const char **filename)
 Decode content disposition. More...
 
int enc_mime_save_to_file (const char *pn, enum enc_mime_cte cte, const char *entity)
 Decode MIME content transfer encoding and save to file. More...
 
const char * enc_mime_decode (enum enc_mime_cte cte, enum enc_mime_cs charset, const char *s)
 Decode MIME text content to UTF-8 NFC. More...
 
const char * enc_mime_flowed_decode (const char *s, unsigned int delsp, unsigned int insline)
 Decode MIME "text/plain" content with "format=flowed" parameter. More...
 
size_t enc_mime_message (const char *s, size_t len, struct enc_mime_mpe **mpe)
 Extract MIME encapsulated message. More...
 
size_t enc_mime_multipart (const char *s, const char *b, struct enc_mime_mpe **mpe)
 Parse MIME multipart content. More...
 
int enc_percent_decode (char *s, int clean)
 Percent decoder. More...
 
const char * enc_uri_percent_encode (const char *s, enum enc_uri_scheme sch)
 Percent encoding for URI content. More...
 
int enc_uc_search (const char *s, size_t start_pos, const char *search_s, size_t *found_pos, size_t *found_len)
 
void enc_free (void *p)
 Free an object allocated by encoding module. More...
 

Content type flags according to RFC 3676

The flags can be bitwise ORed together.

Note
The parameter "InsLine" is experimental (not defined in RFC 3676).
#define ENC_CT_FLAG_FLOWED   0x01U
 
#define ENC_CT_FLAG_DELSP   0x02U
 
#define ENC_CT_FLAG_INSLINE   0x04U
 

Detailed Description

The functions in this group should be conformant to the following standards: ANSI X3.4, ISO 2022, ISO 8601, ISO 8859, ISO 10646, RFC 1468, RFC 2045, RFC 2046, RFC 2047, RFC 2049, RFC 2152, RFC 2183, RFC 2231, RFC 2646, RFC 3629, RFC 3676, RFC 5198, RFC 5536, RFC 6657, POSIX.1-1996, Unicode 14.0.0.

Todo:
We don't use iconv() because on old operating systems there may be no Unicode support. And even on such old machines we don't want an external dependency on GNU iconv.
There should be an option to use the system's iconv() on request.

Macro Definition Documentation

◆ ENC_BO_BUFLEN

#define ENC_BO_BUFLEN   (size_t) 75

Buffer size for multipart boundary strings.

RFC 2046 limits the length of the boundary delimiter to 70 characters. There are always two hyphens before the boundary delimiter. At the end of the multipart body, there are 2 hyphens after the boundary delimiter. Finally we need space for the NUL character to terminate the string. Result: 70 + 2 + 2 + 1 = 75

Definition at line 176 of file encoding.h.

◆ ENC_DELIMITER

#define ENC_DELIMITER
Value:
"________________________________________" \
"_______________________________________|" "\n" \
" " \
" |" "\n"

Delimiter string to print between article header and body parts.

Definition at line 179 of file encoding.h.

◆ ENC_MIME_HEADER_FOLD_ASCII_LINES

#define ENC_MIME_HEADER_FOLD_ASCII_LINES   1

MIME word encoder folding behaviour.

If this is defined to nonzero, all lines of RFC 2047 conformant header fields that contain MIME encoded words are folded before 76 characters. Otherwise all lines that contain no encoded-words are not folded before 998 characters.

RFC 2047 is ambiguous regarding this rule:
https://tools.ietf.org/html/rfc2047#section-2
The default value 1 is safe in any case. Please read section 2, paragraph 5 carefully before redefining this to 0!

Definition at line 73 of file encoding.c.

Enumeration Type Documentation

◆ enc_mime_cs

IDs for supported MIME character sets.

Enumerator
ENC_CS_ASCII 

ANSI X3.4

ENC_CS_ISO8859_1 

ISO 8859-1

ENC_CS_ISO8859_2 

ISO 8859-2

ENC_CS_ISO8859_3 

ISO 8859-3

ENC_CS_ISO8859_4 

ISO 8859-4

ENC_CS_ISO8859_5 

ISO 8859-5

ENC_CS_ISO8859_6 

ISO 8859-6

ENC_CS_ISO8859_7 

ISO 8859-7

ENC_CS_ISO8859_8 

ISO 8859-8

ENC_CS_ISO8859_9 

ISO 8859-9

ENC_CS_ISO8859_10 

ISO 8859-10

ENC_CS_ISO8859_11 

ISO 8859-11

ENC_CS_ISO8859_13 

ISO 8859-13

ENC_CS_ISO8859_14 

ISO 8859-14

ENC_CS_ISO8859_15 

ISO 8859-15

ENC_CS_ISO8859_16 

ISO 8859-16

ENC_CS_ISO8859_X 

Generic fallback (handle only ASCII characters)

ENC_CS_MACINTOSH 

Mac Roman

ENC_CS_KOI8R 

Kod Obmena Informatsiey 8 bit (Russian)

ENC_CS_KOI8U 

Kod Obmena Informatsiey 8 bit (Ukrainian)

ENC_CS_WINDOWS_1250 

Windows codepage 1250

ENC_CS_WINDOWS_1251 

Windows codepage 1251

ENC_CS_WINDOWS_1252 

Windows codepage 1252

ENC_CS_WINDOWS_1253 

Windows codepage 1253

ENC_CS_WINDOWS_1254 

Windows codepage 1254

ENC_CS_WINDOWS_1255 

Windows codepage 1255

ENC_CS_WINDOWS_1256 

Windows codepage 1256

ENC_CS_WINDOWS_1257 

Windows codepage 1257

ENC_CS_WINDOWS_1258 

Windows codepage 1258

ENC_CS_IBM437 

IBM codepage 437

ENC_CS_IBM775 

IBM codepage 775

ENC_CS_IBM850 

IBM codepage 850

ENC_CS_IBM852 

IBM codepage 852

ENC_CS_IBM858 

IBM codepage 858

ENC_CS_ISO2022_JP 

ISO 2022-JP

ENC_CS_CESU_8 

Compatibility Encoding Scheme for UTF-16

ENC_CS_UTF_7 

UCS Transformation Format 7 bit

ENC_CS_UTF_8 

UCS Transformation Format 8 bit

ENC_CS_UTF_16BE 

UCS Transformation Format 16 bit big endian

Definition at line 59 of file encoding.h.

◆ enc_mime_ct_subtype

IDs for supported MIME content subtypes.

Enumerator
ENC_CTS_PLAIN 

Plain without enrichment

ENC_CTS_MIXED 

Independent parts with a particular order

ENC_CTS_ALTERNATIVE 

Different representations of same content

ENC_CTS_DIGEST 

Default media type message/rfc822

ENC_CTS_RFC822 

Encapsulated message

ENC_CTS_OCTETSTREAM 

Raw octet stream

Definition at line 36 of file encoding.h.

◆ enc_mime_ct_type

IDs for supported MIME content types.

Enumerator
ENC_CT_TEXT 

Text

ENC_CT_IMAGE 

Picture

ENC_CT_AUDIO 

Audio

ENC_CT_VIDEO 

Video

ENC_CT_MULTIPART 

Content consists of multiple parts

ENC_CT_MESSAGE 

Content consists of an encapsulated message

ENC_CT_APPLICATION 

Content for unknown application

Definition at line 23 of file encoding.h.

◆ enc_mime_cte

IDs for supported MIME content transfer encodings.

Enumerator
ENC_CTE_7BIT 

ASCII text

ENC_CTE_8BIT 

Raw non ASCII text

ENC_CTE_BIN 

Arbitrary binary data

ENC_CTE_Q 

MIME quoted-printable

ENC_CTE_B 

MIME base64

Definition at line 48 of file encoding.h.

◆ enc_uri_scheme

URI schemes.

Enumerator
ENC_URI_SCHEME_HTTP 

Hyper Text Transfer Protocol

ENC_URI_SCHEME_FTP 

File Transfer Protocol

ENC_URI_SCHEME_NEWS 

News group or article

ENC_URI_SCHEME_MAILTO 

E-mail

Definition at line 129 of file encoding.h.

Function Documentation

◆ enc_ascii_check()

int enc_ascii_check ( const char *  s)

Verify ASCII encoding.

Parameters
[in]sString to verify
Returns
  • 0 on success
  • Negative value on error

Definition at line 4944 of file encoding.c.

Referenced by core_get_signature(), core_post_article(), and core_subscribe_group().

◆ enc_ascii_check_alpha()

int enc_ascii_check_alpha ( const char *  s)

Check for ASCII alphabetic characters.

Parameters
[in]sPointer to single character

Locale independent check based on ASCII.

Returns
  • 0 if s is an alphabetic character
  • Negative value if s is not an alphabetic character

Definition at line 4972 of file encoding.c.

Referenced by enc_ascii_convert_distribution().

◆ enc_ascii_check_digit()

int enc_ascii_check_digit ( const char *  s)

Check for ASCII digit characters.

Parameters
[in]sPointer to single character

Locale independent check based on ASCII.

Returns
  • 0 if s is a digit character
  • Negative value if s is not a digit character

Definition at line 4995 of file encoding.c.

Referenced by enc_ascii_convert_distribution(), and enc_convert_ascii_to_anum().

◆ enc_ascii_check_printable()

int enc_ascii_check_printable ( const char *  s)

Check for printable ASCII characters.

Parameters
[in]sString to check

HT (9) and SPACE (32, 0x20) inside s are treated as "printable" to make this function suitable to check header field bodies according to RFC 5322.

Note
The function enc_ascii_convert_to_printable() can be used on error.
Returns
  • 0 on success
  • Negative value on error

Definition at line 5022 of file encoding.c.

Referenced by enc_mime_word_encode(), and ext_download_file().

◆ enc_ascii_convert_distribution()

void enc_ascii_convert_distribution ( char *  s)

Convert body of distribution header field.

Parameters
[in]sString with unfolded body to convert

This function processes s in-place. The result will always be shorter than or the same length as the original data.

Every element of dist-list that contains invalid characters is removed.

Definition at line 5074 of file encoding.c.

References enc_ascii_check_alpha(), enc_ascii_check_digit(), and PRINT_ERROR.

◆ enc_ascii_convert_to_printable()

void enc_ascii_convert_to_printable ( char *  s)

Convert to printable ASCII format.

Parameters
[in]sString to convert

This function should be used to repair a string in-place after the function enc_ascii_check_printable() has reported an error.

Every invalid byte is replaced with '?'.

Definition at line 5049 of file encoding.c.

◆ enc_convert_anum_to_ascii()

int enc_convert_anum_to_ascii ( char  result[17],
size_t *  len,
core_anum_t  wm 
)

Convert article number from numerical format to ASCII.

Parameters
[out]resultPointer to result string buffer (Size: 17 bytes)
[out]lenPointer to length of result string (Maximum value: 16)
[in]wmArticle number (watermark) to convert

RFC 3977 allows max. 16 digits.

Note
The output is locale independent.
Returns
  • 0 on success
  • Negative value on error (result and len are not valid)

Definition at line 4558 of file encoding.c.

References CORE_ANUM_T_MAX, and PRINT_ERROR.

◆ enc_convert_ascii_to_anum()

int enc_convert_ascii_to_anum ( core_anum_t result,
const char *  wm,
int  len 
)

Convert number from ASCII to numerical format.

Parameters
[out]resultPointer to result
[in]wmArticle number (watermark) string to convert
[in]lenLength of string wm

Max. 20 digits are supported, sufficient for 64-bit article numbers. RFC 3977 allows max. 16 digits.

This function correctly processes leading zeros and does not use standard library functions with locale dependent behaviour.

Note
wm needs no termination, the first len characters are used.
Returns
  • 0 on success
  • Negative value on error
  • -2 means larger than NNTP_ANUM_T_MAX

Definition at line 4604 of file encoding.c.

References enc_ascii_check_digit(), NNTP_ANUM_T_MAX, and PRINT_ERROR.

◆ enc_convert_canonical_to_posix()

const char* enc_convert_canonical_to_posix ( const char *  s,
int  rcr,
int  rlf 
)

Convert from canonical (RFC 822) to local (POSIX) form.

Parameters
[in]sString to convert
[in]rcrReplace invalid CR control characters if nonzero
[in]rlfReplace invalid LF control characters if nonzero

According to RFC 822 and RFC 2049 this function accepts plain text article content in canonical form and converts the CRLF line breaks to local (POSIX, single LF) form.

Attention
Single CR and LF control characters (not part of a CRLF sequence) are forbidden in canonical format of text by RFC 2045 and RFC 2046. Default behaviour is to preserve single CR and LF control characters. The Unicode codepoint defined by ENC_RC can be inserted as replacement for CR and/or LF by setting rcr and/or rlf respectively to a nonzero value.

On success the caller is responsible to free the allocated memory.

Returns
  • Pointer to decoded data (a new memory block was allocated)
  • NULL on error

Definition at line 5579 of file encoding.c.

Referenced by core_convert_canonical_to_posix().

◆ enc_convert_iso8601_to_posix()

int enc_convert_iso8601_to_posix ( core_time_t pts,
const char *  isodate 
)

Convert ISO 8601 conformant UTC date and time to POSIX timestamp.

Parameters
[out]ptsSeconds since epoch (as defined by POSIX.1)
[in]isodateBuffer for date string (at least 20 characters)
Attention
The parameter isodate must be in YYYY-MM-DDTHH-MM-SSZ format (UTC).
Note
This function accepts no date input before the epoch.
Returns
  • 0 on success
  • Negative value on error

Definition at line 4450 of file encoding.c.

References PRINT_ERROR.

◆ enc_convert_iso8601_to_timestamp()

int enc_convert_iso8601_to_timestamp ( const char **  ts,
const char *  isodate 
)

Convert ISO 8601 conformant date to canonical timestamp.

Parameters
[out]tsPointer to canonical timestamp as defined by RFC 5322
[in]isodateISO 8601 date string (exactly 10 characters)
Attention
The parameter isodate must be in YYYY-MM-DD format (only date, time is not supported).
Note
On success, the caller is responsible to free the memory allocated for the result string.
Returns
  • 0 on success
  • Negative value on error

Definition at line 4503 of file encoding.c.

References PRINT_ERROR.

◆ enc_convert_lines_to_string()

void enc_convert_lines_to_string ( char *  l,
unsigned long int  l_raw 
)

Convert number of lines to string.

Parameters
[out]lPointer to result buffer (at least 11 characters large)
[in]l_rawNumber of lines
Attention
The value of l_raw must be representable as decimal number with not more than 10 digits. Otherwise the string "Error" is returned.

Definition at line 4119 of file encoding.c.

◆ enc_convert_octet_to_hex()

int enc_convert_octet_to_hex ( char *  result,
unsigned int  octet 
)

Convert octet to hexadecimal (ASCII) format.

Parameters
[out]resultPointer to result
[in]octetOctet to convert

Exactly 3 bytes are written to the buffer pointed to by result . If octet is smaller than 16, a leading zero is created. On error, the result "XX" is generated. The result is always a zero terminated string.

Returns
  • 0 on success
  • Negative value on error

Definition at line 4664 of file encoding.c.

◆ enc_convert_posix_to_canonical()

const char* enc_convert_posix_to_canonical ( const char *  s)

Convert from local (POSIX) to canonical (RFC 822) form.

Parameters
[in]sString to convert

According to RFC 822 and RFC 2049 this function accepts plain text article content in local (POSIX) form and converts the single LF line breaks to canonical (CRLF) form.

According to RFC 2045 and RFC 2046 single CR characters are deleted.

On success the caller is responsible to free the allocated memory.

Returns
  • Pointer to decoded data (a new memory block was allocated)
  • NULL on error

Definition at line 5695 of file encoding.c.

References PRINT_ERROR.

Referenced by core_convert_posix_to_canonical().

◆ enc_convert_posix_to_iso8601()

int enc_convert_posix_to_iso8601 ( char *  isodate,
core_time_t  pts 
)

Convert POSIX timestamp to ISO 8601 conformant local date and time.

Parameters
[out]isodateBuffer for date string (at least 20 characters)
[in]ptsSeconds since epoch (as defined by POSIX.1)

ISO 8601 allows omitting the 'T' character between the date and time fields if there is no risk of confusing a date and time of day representation. This is the case here => We omit the 'T' for better human readability

Returns
  • 0 on success
  • Negative value on error
Todo:
Calling the operating system for date conversion should be replaced before the year 2038 (when 32 bit signed time_t implementations will overflow).

Definition at line 4342 of file encoding.c.

References PRINT_ERROR.

◆ enc_convert_to_8bit()

const char* enc_convert_to_8bit ( enum enc_mime_cs charset,
const char *  s,
const char **  cs_iana 
)

Convert string from Unicode (UTF-8 NFC) to an 8bit character set.

Parameters
[out]charsetPointer to character set of result (or NULL)
[in]sUnicode string to convert in UTF-8 NFC format
[out]cs_ianaPointer to IANA charset name of result (or NULL)
Attention
Ensure that the string s is valid UTF-8 and normalized to NFC. Otherwise this function will not work as expected.

According to RFC 2046 the following rules are applied:

  • In general, composition software should always use the "lowest common denominator" character set possible => We do so by preferring the widely supported ISO 8859-1 character set.
Note
If this function supports more character sets in the future, ISO 8859-1 must always stay the preferred one (because this is our fallback locale character set to allow the use of POSIX regular expressions without Unicode support from the system).

If NULL is passed as parameter charset or cs_iana , this indicates that the caller is not interested in this information. The corresponding data is discarded in this case.

Returns
  • Pointer to encoded data (the character set is written to charset) If the result is not equal to s , a new memory block was allocated
  • NULL on error (Original memory block for s is still allocated) Nothing is written to charset and cs_iana in this case

Definition at line 6005 of file encoding.c.

Referenced by core_post_article().

◆ enc_convert_to_utf8_nfc()

const char* enc_convert_to_utf8_nfc ( enum enc_mime_cs  charset,
const char *  s 
)

Convert string from supported character set to Unicode (UTF-8 NFC)

Parameters
[in]charsetCharacter set of string s
[in]sString to convert

According to RFC 2049 the following rules are applied:

  • For all character sets from the ISO 8859 family that are not supported, at least the ASCII characters must be decoded correctly => We decode all non ASCII characters as "?" in this case.

According to RFC 3629 the following rules are applied:

  • If the input data is already UTF-8 it is not allowed to accept it unchecked. It is mandatory to check the validity of the encoding => We do so.
Note
Some control characters that may cause problems are removed.
Returns
  • Pointer to decoded Unicode data (UTF-8 encoded with NFC normalization) If the result is not equal to s , a new memory block was allocated
  • NULL on error (Original memory block for s is still allocated)

Definition at line 5788 of file encoding.c.

Referenced by core_post_article().

◆ enc_create_name_addr()

const char* enc_create_name_addr ( const char *  data,
size_t  offset 
)

Create a "name-addr" construct according to RFC 5322.

This function is intended to create the "From" and "Reply-To" header fields.

Parameters
[in]dataInput data
[in]offsetFolding offset, e.g. sizeof("From: ")

The input data must have the following format: name <addr-spec> .

Attention
The addr-spec construct is not allowed to contain comments or quoted strings. Both parts, name and <addr-spec> must fit on a single header line of 998 characters. Note that offset adds to the length of name .

name must be a Unicode identifier corresponding to addr-spec . If it contains non-ASCII characters, it is converted to a valid display-name token. The result will be folded according to RFC 2047.

On success the caller is responsible to free the memory allocated for the result.

Returns
  • Pointer to encoded data (a new memory block was allocated)
  • NULL on error

Definition at line 3952 of file encoding.c.

◆ enc_create_wildmat()

int enc_create_wildmat ( struct enc_wm_pattern **  obj,
const char *  wm 
)

Create wildmat pattern array.

Parameters
[out]objPointer to wildmat pattern array
[in]wmRFC 3977 conformant wildmat

This function splits a RFC 3977 conformant wildmat into its elements of type wildmat-pattern . Every wildmat-pattern is converted to a POSIX extended regular expression and stored together with a negation flag (that is set if the wildmat-pattern was preceded by an exclamation mark) in the array obj .

On success the caller is responsible to free the memory allocated for the resulting array with the function enc_destroy_wildmat() .

Attention
If the wildmat wm contains Unicode data, it must be normalized to NFC by the caller.
Returns
  • Number of patterns in the object on success
  • Negative value on error (NULL was written to obj)

Definition at line 5371 of file encoding.c.

References enc_destroy_wildmat(), enc_uc_check_utf8(), and PRINT_ERROR.

◆ enc_destroy_wildmat()

void enc_destroy_wildmat ( struct enc_wm_pattern **  obj,
int  num 
)

Destroy wildmat pattern array.

Parameters
[in,out]objPointer to wildmat pattern array
[in]numNumber of elements in array

NULL is written to the location pointed to by obj after releasing the memory allocated for the array.

Definition at line 5537 of file encoding.c.

Referenced by enc_create_wildmat().

◆ enc_extract_addr_spec()

const char* enc_extract_addr_spec ( const char *  mailbox)

Extract addr-spec token from RFC 5322 mailbox.

Parameters
[in]mailboxRFC 5322 mailbox
Attention
The checks are more restrictive than the formal specification of RFC 5322. White space is not allowed inside the addr-spec token!
Note
It is tolerated that mailbox contains an invalid name-addr token because it is ignored anyway.

On success a pointer to the result buffer is returned. The caller is responsible to free the memory allocated for this buffer.

Returns
  • Pointer to new memory block containing the addr-spec token
  • NULL on error

Definition at line 4830 of file encoding.c.

◆ enc_free()

void enc_free ( void *  p)

Free an object allocated by encoding module.

Use this function to release dynamic memory that was allocated by the encoding module.

Parameters
[in]pPointer to object

Release the memory for the object pointed to by p.

Note
The pointer p is allowed to be NULL and no operation is performed in this case.

Definition at line 8868 of file encoding.c.

Referenced by core_get_cancel_key(), core_get_cancel_lock(), core_get_msgid(), and core_post_article().

◆ enc_get_iso8601_utc()

int enc_get_iso8601_utc ( char *  isodate)

Get current UTC date in ISO 8601 conformant format.

Parameters
[out]isodateBuffer for date string (at least 21 characters)

The date is written to isodate in YYYY-MM-DDTHH-MM-SSZ format.

Returns
  • 0 on success
  • Negative value on error
Todo:
Calling the operating system for date conversion should be replaced before the year 2038 (when 32 bit signed time_t implementations will overflow).

Definition at line 4395 of file encoding.c.

◆ enc_lines_decode()

unsigned long int enc_lines_decode ( const char *  lines)

Decode number of lines.

Parameters
[in]linesNumber of lines

lines must be a RFC 5536 conformant body of the (now obsolete) "Lines" header field.

Returns
  • Number of lines
  • 0 on error

Definition at line 4098 of file encoding.c.

◆ enc_mime_decode()

const char* enc_mime_decode ( enum enc_mime_cte  cte,
enum enc_mime_cs  charset,
const char *  s 
)

Decode MIME text content to UTF-8 NFC.

Parameters
[in]cteMIME content transfer encoding
[in]charsetMIME character set
[in]sMIME encoded data

According to RFC 2049 all transfer encodings not defined in MIME 1.0 are rejected.

Returns
  • Pointer to decoded data. If the result is not equal to s , a new memory block was allocated
  • NULL on error (Original memory block for s is still allocated)

Definition at line 7801 of file encoding.c.

References ENC_CTE_Q.

◆ enc_mime_encode_base64()

int enc_mime_encode_base64 ( const char **  enc,
const char *  data,
size_t  len 
)

Encode binary data to base64.

Parameters
[out]encPointer to result (zero terminated string)
[in]dataData to encode
[in]lenData length

If len is zero, data is not dereferenced and the result will be an empty string.

On error, nothing is written to enc .

On success a pointer to the result buffer will be written to enc . The caller is responsible to free the memory allocated for this buffer.

Returns
  • 0 on success
  • Negative value on error

Definition at line 4744 of file encoding.c.

References data.

Referenced by core_get_cancel_key(), core_get_cancel_lock(), core_get_msgid(), and digest_randomart().

◆ enc_mime_flowed_decode()

const char* enc_mime_flowed_decode ( const char *  s,
unsigned int  delsp,
unsigned int  insline 
)

Decode MIME "text/plain" content with "format=flowed" parameter.

Parameters
[in]sMIME encoded data in canonical form
[in]delspDelete spaces at EOL if nonzero
[in]inslineAdd empty line separator after paragraphs if nonzero
Attention
The encoding of the data referenced by s must be valid Unicode in UTF-8 representation. This must be checked by the caller before this function is used.
Returns
  • Pointer to decoded data (if the result is not equal to s , a new memory block was allocated)
  • NULL on error (Original memory block for s is still allocated)

Definition at line 7856 of file encoding.c.

References CONF_QUOTESTYLE, config, and PRINT_ERROR.

◆ enc_mime_get_cd()

void enc_mime_get_cd ( const char *  hf_body,
enum enc_mime_cd type,
const char **  filename 
)

Decode content disposition.

Parameters
[in]hf_bodyBody of Content-Disposition header field
[out]typePointer to content disposition type ID
[out]filenamePointer to filename

The field body hf_body must be unfolded and preprocessed (parameters must be already decoded according to RFC 2231). The value for the filename parameter must be already converted to UTF-8.

If a filename parameter is present, a new memory block is allocated for filename . Otherwise NULL is returned.

Definition at line 7619 of file encoding.c.

◆ enc_mime_get_ct()

void enc_mime_get_ct ( struct enc_mime_ct ct,
const char *  hf_body,
char *  bo 
)

Decode MIME "Content-Type" header field.

Parameters
[out]ctPointer to result structure
[in]hf_bodyHeader field body that contains the MIME content type
[out]boPointer to buffer for multipart boundary delimiter

The header field body hf_body is decoded and content IDs are written to the structure pointed to by ct .

The buffer for the boundary string used in messages with content type "multipart" must be allocated by the caller with a size of at least ENC_BO_BUFLEN and a pointer to the start of this buffer must be passed as bo parameter. It is allowed to pass NULL for bo if the caller is not interested in the boundary string.

According to RFC 2045 the following rules are applied:

  • If the content type is not present, "text/plain" and "US-ASCII" must be used as default => We do so.

According to RFC 2046 the following rules are applied:

  • The content type and subtype must be treated case insensitive => We do so.
  • The parameter names must be treated case insensitive => We do so.
  • The default character set must be assumed as "US-ASCII" if the "charset" parameter is missing for "text/plain" content type => We do so.

According to RFC 3676 the following rules are applied:

  • The values of parameters "Format" and "DelSp" must be treated case insensitive => We do so.
  • The parameter "DelSp" should be ignored if content type is not "text/plain" with "format=flowed" => We do so.

The experimental parameter "InsLine" set to "yes" adds an empty line separator after every paragraph that ends with an empty line. This allows single lines to be declared as paragraphs, e.g. for smartphones, without losing the separation from the following text (or creating double empty line separation in compatibility view).

Note
This function never fails, instead ENC_xxx_UNKNOWN IDs are returned.

Definition at line 7235 of file encoding.c.

◆ enc_mime_get_cte()

enum enc_mime_cte enc_mime_get_cte ( const char *  hf_body)

Decode content transfer encoding description.

Parameters
[in]hf_bodyMIME content transfer encoding description string

This function checks whether the string hf_body represents a supported content transfer encoding and returns the corresponding ID for it. According to RFC 2047 the content transfer encoding is treated case-insensitive.

Note
It is allowed to call this function with hf_body set to NULL. This is treated as an error and the return value will indicate an unknown transfer encoding.
RFC 2049 requires that every non-7bit MIME content must be labeled with a content transfer encoding header field of "8bit" or "binary".
Returns
  • MIME content transfer encoding ID (from enc_mime_cte )
  • ENC_CTE_UNKNOWN on error

Definition at line 7536 of file encoding.c.

References ENC_CTE_BIN, and ENC_CTE_BUFLEN.

◆ enc_mime_message()

size_t enc_mime_message ( const char *  s,
size_t  len,
struct enc_mime_mpe **  mpe 
)

Extract MIME encapsulated message.

Parameters
[in]sMIME encapsulated message
[in]lenLength of encapsulated message
[out]mpeMIME multipart entity locations

On success a pointer to the result array is written to mpe . The caller is responsible to free the memory allocated for this array.

Returns
  • 1 on success
  • 0 on error

Definition at line 8168 of file encoding.c.

References enc_mime_mpe::len, PRINT_ERROR, and enc_mime_mpe::start.

◆ enc_mime_multipart()

size_t enc_mime_multipart ( const char *  s,
const char *  b,
struct enc_mime_mpe **  mpe 
)

Parse MIME multipart content.

Parameters
[in]sMIME encoded multipart data
[in]bMIME boundary delimiter
[out]mpeMIME multipart entity locations

On success a pointer to the result array is written to mpe . The caller is responsible to free the memory allocated for this array.

Returns
  • Nonzero number of entities in multipart data on success
  • 0 on error

Definition at line 8210 of file encoding.c.

References ENC_BO_BUFLEN, enc_mime_mpe::len, PRINT_ERROR, and enc_mime_mpe::start.

◆ enc_mime_para_decode()

int enc_mime_para_decode ( const char **  r,
const char *  b,
int  m 
)

Decode header field containing potential MIME parameters.

Parameters
[out]rPointer to result string pointer
[in]bPrepared header field body that contains potential parameters
[in]mOperating mode (see description below)

The parameter m enables special processing if set to a nonzero value. m should be set to 1 for the Content-Type header field.

Attention
This function must be called after unfolding the field body, with comments stripped and after decoding of quoted-string tokens. Whitespace must already be merged into the semantically equivalent single SP (and removed completely before semicolons and around equal signs) by the caller.

According to RFC 2231 the following rules are applied:

  • Parameters can be split into multiple sections which can be listed in arbitrary order inside the header field body => We accept parameter sections in any order and merge them in ascending order.
  • Parameter sections are allowed to contain literal content as well as quoted-string tokens. Mixing sections of both types is allowed => quoted-string tokens must already be decoded in b by the caller.
  • Parameters can contain character set information => We accept content in any supported character set and decode it to Unicode NFC (non-US-ASCII octets of unsupported character sets are decoded to the underscore character).
  • Parameters can contain language information => We accept and ignore it.

According to RFC 3629 the following rules are applied:

  • If the content of a parameter is UTF-8 encoded, it is not allowed to accept it unchecked. It is mandatory to check the validity of the encoding => We do so.

On success, the address of the result buffer is written to the location pointed to by r (this may be the same as b if there is nothing to do). The caller is responsible to free the potentially allocated memory. On error NULL is written to the location pointed to by r .

Returns
  • 0 on success if something was decoded and a new memory block was allocated
  • 1 on success if there was nothing to decode and no memory was allocated
  • -1 on error

Definition at line 6817 of file encoding.c.

References ENC_MIME_PARA_LENGTH_MAX, and PRINT_ERROR.

◆ enc_mime_save_to_file()

int enc_mime_save_to_file ( const char *  pn,
enum enc_mime_cte  cte,
const char *  entity 
)

Decode MIME content transfer encoding and save to file.

Parameters
[in]pnPathname of file
[in]cteMIME content transfer encoding
[in]entityMIME entity body

According to RFC 2049 all transfer encodings not defined in MIME 1.0 are rejected.

Returns
  • 0 on success
  • -1 on error

Definition at line 7717 of file encoding.c.

References ENC_CTE_Q.

◆ enc_mime_word_decode()

int enc_mime_word_decode ( const char **  r,
const char *  b 
)

Decode header field containing potential MIME encoded-word tokens.

Parameters
[out]rPointer to result string pointer
[in]bHeader field body that contains potential encoded-words

The header field body b must be unfolded before calling this function.

According to RFC 2047 the following rules are applied:

  • An encoded-word is not allowed to be longer than 75 characters => We decode encoded-words of arbitrary length.
  • An encoded-word not at the beginning can start after a 'linear-white-space' token => We resync the parser after every white space.
  • Any amount of linear-white-space between 'encoded-word's must be ignored => We do so.
  • The character set and encoding fields must be treated case-insensitive => We do so.
  • All character sets from the ISO 8859 family that are not supported must be handled in a way that contained ASCII characters are decoded correctly => We do so.

According to RFC 3629 the following rules are applied:

  • If the content of an encoded word is UTF-8 encoded, it is not allowed to accept it unchecked. It is mandatory to check the validity of the encoding => We do so.

On success, the address of the result buffer is written to the location pointed to by r (this may be the same as b if there is nothing to do). The caller is responsible to free the potentially allocated memory. On error NULL is written to the location pointed to by r .

Returns
  • 0 on success if something was decoded and a new memory block was allocated
  • 1 on success if there was nothing to decode and no memory was allocated
  • -1 on error

Definition at line 6518 of file encoding.c.

◆ enc_mime_word_encode()

int enc_mime_word_encode ( const char **  r,
const char *  b,
size_t  pl 
)

Encode header field body using MIME encoded-word tokens.

This function uses quoted-printable encoding.

Parameters
[out]rPointer to result string pointer
[in]bHeader field body that contains potential Unicode data
[in]plLength of header field prefix (Length limit: 25)

The header field body b must be verified by the caller to be valid UTF-8 (this function will do the normalization to NFC). The CRLF termination must be removed before calling this function.

The length pl must include the header field name, the colon and any potential white space not included in b .

According to RFC 5536 the following rules are applied:

  • A header field line is not allowed to be empty => The header field is never folded immediately after the name separator.
  • Lines are not allowed to contain more than 1000 characters => We respect this by rejecting words that are longer than 998 characters.

According to RFC 2047 the following rules are applied:

  • White space between encoded-words is semantically ignored => A single space between encoded-words is included in the trailing word, additional LWSP characters are included into the leading word.
  • A header line containing encoded-words must be no longer than 76 characters => We fold before this limit.
  • If folding is required, each encoded-word must contain an integral number of characters and must be self-contained => We only split between Unicode combining character sequences when using UTF-8 (between grapheme clusters would be better, but is not supported yet)
  • If there is more than one character set that can represent the 8-bit content of an encoded-word, ISO 8859 should be preferred => We do so if the required ISO 8859 encoder is available (can be disabled with the force_unicode option in configfile).
  • If encoded-word is not used because of 8-bit data, US-ASCII should be used => We do so (can be disabled with the force_unicode option in configfile).

According to RFC 5198 the following rules are applied:

  • It's recommended to use NFC normalization in general Internet text messages => We do so.

On success, the address of the result buffer is written to the location pointed to by r (this may be the same as b if there is nothing to do). The caller is responsible to free the potentially allocated memory. On error NULL is written to the location pointed to by r .

Returns
  • 0 on success if a new memory block was allocated
  • 1 on success if there was nothing to encode and no memory was allocated
  • -1 on error

Definition at line 6103 of file encoding.c.

References enc_ascii_check_printable(), ENC_CS_UTF_8, enc_uc_check_utf8(), and PRINT_ERROR.

◆ enc_percent_decode()

int enc_percent_decode ( char *  s,
int  clean 
)

Percent decoder.

Parameters
[in]sString to decode (URI or MIME parameter value)
[in]cleanReplace NUL and ';' with '_' if nonzero
Note
The data is decoded in place because it can't be larger after the decoding operation.

If s is NULL no operation is performed and success is returned.

Returns
  • Positive value on success (if data in s was decoded)
  • 0 on success (if there was nothing to do)
  • Negative value if percent encoding in s is invalid

Definition at line 8318 of file encoding.c.

References enc_mime_mpe::len.

◆ enc_rot13()

void enc_rot13 ( char *  data)

Encode or decode data with ROT13 algorithm.

Parameters
[in]dataPointer to buffer with Data to encode/decode

Any character that is not a latin ASCII character in the ranges A..Z and a..z will stay unchanged.

No memory is allocated. The operation is executed in the buffer pointed to by data .

Definition at line 4692 of file encoding.c.

References data.

◆ enc_timestamp_decode()

core_time_t enc_timestamp_decode ( const char *  timestamp)

Decode canonical timestamp to POSIX time (seconds since epoch)

According to RFC 5322 all military timezones should be treated as UTC because there was an error in RFC 822 => We do so and accept "Z" as valid because it means UTC

Note
This function does not accept timestamps before the epoch (Usenet did not yet exist at that time).
Parameters
[in]timestampRFC 5536 conformant timestamp string
Returns
  • Seconds since epoch (as defined by POSIX.1)
  • 0 on error

Definition at line 4154 of file encoding.c.

References PRINT_ERROR.

◆ enc_uc_check_utf8()

int enc_uc_check_utf8 ( const char *  s)

Verify UTF-8 encoding.

Parameters
[in]sString to verify
Attention
Read chapter 10 of RFC 3629 for UTF-8 security considerations.

According to RFC 3629 the following rules are applied:

  • Character code points beyond 0x10FFFF are invalid => We reject them.
  • Only the shortest possible code sequence is allowed => We verify this.
  • Surrogate character code points are invalid for UTF-8 => We reject them.
Returns
  • 0 on success
  • Negative value on error

Definition at line 5162 of file encoding.c.

Referenced by core_get_signature(), enc_create_wildmat(), enc_mime_word_encode(), and enc_uc_repair_utf8().

◆ enc_uc_encode_utf8()

void enc_uc_encode_utf8 ( char *  buf,
size_t *  i,
long int *  dbuf,
size_t *  di 
)

Encode Unicode codepoints to UTF-8.

Parameters
[out]bufEncoded UTF-8 string
[in,out]iCurrent index in buf
[in]dbufCodepoint buffer
[in,out]diNumber of codepoints in dbuf
Attention
The target buffer buf must be large enough for the encoded data. This must be ensured by the caller using worst case calculations.

On success, the start index of the next codepoint is written to the location pointed to by i and zero is written to the location pointed to by di .

Definition at line 1008 of file encoding.c.

References data, and PRINT_ERROR.

◆ enc_uc_repair_utf8()

const char* enc_uc_repair_utf8 ( const char *  s)

Repair UTF-8 encoding.

Parameters
[in]sString to repair

Invalid UTF-8 sequences and invalid codepoints are replaced with U+FFFD.

Returns
  • Pointer to new memory block on success
  • NULL on error

Definition at line 5181 of file encoding.c.

References enc_uc_check_utf8(), and PRINT_ERROR.

◆ enc_uri_percent_encode()

const char* enc_uri_percent_encode ( const char *  s,
enum enc_uri_scheme  sch 
)

Percent encoding for URI content.

Parameters
[in]sURI body to encode
[in]schURI scheme

Passing NULL for parameter s is allowed and treated as error.

Generic URI syntax is defined in RFC 3986.
The scheme "ftp" is defined in RFC 1738.
The scheme "http" is defined in RFC 7230.
The scheme "mailto" is defined in RFC 6068.
The scheme "news" is defined in RFC 5538.

The following characters are percent encoded:

  • Space (not allowed for "mailto" and "news" schemes)
  • The literal percent sign
  • The list "gen-delims" defined in RFC 3986
  • Anything not in the list "unreserved" for "http" and "ftp" schemes
  • For the "mailto" scheme exactly one "commercial at" sign is required and treated literally
  • For the "news" scheme a single "commercial at" sign is accepted literally
Returns
  • Pointer to result on success. If the result is not equal to s , a new memory block was allocated
  • NULL on error

Definition at line 8402 of file encoding.c.

References ENC_URI_SCHEME_FTP, ENC_URI_SCHEME_HTTP, ENC_URI_SCHEME_MAILTO, ENC_URI_SCHEME_NEWS, and PRINT_ERROR.


Generated at 2024-04-27 using  doxygen