Lexbor: webscraping an HTML table in C

Goal:

To find and print one value between <td> tags from an HTML table using lexbor. The details and source code of Lexbor can be found here.

Further details:

There are many <td> tags, and each one is represented by a unique header. A simple example can be seen below, where only the first column value of 0.7 is of interest (i.e. the tag with header="choose-this-header").

<table>
<tbody>
<tr>
    <td header="choose-this-header">0.7</td>
    <td header="ignore-this-header">1.3</td>
    <td header="ignore-this-header">5.4</td>
</tr>
</tbody>
</table>

Therefore, it seems that the best approach to find this value is to:

  1. Search through the HTML for the element where header="choose-this-header"
  2. Isolate this line of HTML, and extract the value between the <td>...</td> tags

Problem:

Based on this lexbor example, step_one.c seen below successfully detects the line of HTML containing the required header, but it is printed to the terminal as <td header="choose-this-header"> without the text value or closing </td> tag. If there is a way to save the entire line (i.e. <td header="choose-this-header">0.7</td>) into a buffer, the program step_two.c (based on this example, further below) could be used to extract the text value of 0.7.

step_one.c

#include "base.h"

#include <lexbor/dom/dom.h>


/*
 * Serialize every element stored in `collection`, in index order, and then
 * clean the collection.
 */
static void
print_collection_elements(lxb_dom_collection_t *collection)
{
    size_t idx = 0;

    while (idx < lxb_dom_collection_length(collection)) {
        lxb_dom_node_t *node;

        /* Fetch the idx-th element and view it as a generic DOM node. */
        node = lxb_dom_interface_node(lxb_dom_collection_element(collection,
                                                                 idx));
        serialize_node(node);

        idx++;
    }

    lxb_dom_collection_clean(collection);
}

/*
 * step_one.c entry point.
 *
 * Parses a small HTML table, collects every element below <body> whose
 * "header" attribute equals "choose-this-header", and serializes each match.
 *
 * PRINT, FAILED, parse() and serialize_node() are not defined in this file;
 * they are expected to be provided by "base.h".
 */
int
main(int argc, const char *argv[])
{
    lxb_status_t status;
    lxb_dom_element_t *body;
    lxb_html_document_t *document;
    lxb_dom_collection_t *collection;

    /* Inner double quotes must be escaped to stay inside the C literal. */
    const lxb_char_t html[] = "<table>"
            "<tbody>"
            "<tr>"
            "<td header=\"choose-this-header\">0.7</td>"
            "<td header=\"ignore-this-header\">1.3</td>"
            "<td header=\"ignore-this-header\">5.4</td>"
            "</tr>"
            "</tbody>"
            "</table>";

    /* Keep names and values as arrays so lengths come from sizeof. */
    static const lxb_char_t attr_name[] = "header";
    static const lxb_char_t attr_value[] = "choose-this-header";

    size_t html_size = sizeof(html) - 1;

    (void) argc;
    (void) argv;

    PRINT("HTML:");
    PRINT("%s", (const char *) html);

    document = parse(html, html_size);

    body = lxb_dom_interface_element(document->body);

    collection = lxb_dom_collection_make(&document->dom_document, 128);
    if (collection == NULL) {
        FAILED("Failed to create Collection object");
    }

    /* Full match */
    status = lxb_dom_elements_by_attr(body, collection,
                                      attr_name, sizeof(attr_name) - 1,
                                      attr_value, sizeof(attr_value) - 1,
                                      true);
    if (status != LXB_STATUS_OK) {
        FAILED("Failed to get elements by attribute");
    }

    PRINT("\nFull match by 'choose-this-header':");
    print_collection_elements(collection);

    lxb_dom_collection_destroy(collection, true);
    lxb_html_document_destroy(document);

    return 0;
}

step_one.c output:

HTML:
<table><tbody><tr><td header="choose-this-header">0.7</td><td header="ignore-this-header">1.3</td><td header="ignore-this-header">5.4</td></tr></tbody></table>

Full match by 'choose-this-header':
<td header="choose-this-header"> // no text value or closing tag is printed

step_two.c

#include "lexbor/html/tokenizer.h"


/*
 * Print a printf-style message followed by a newline to stderr, then
 * terminate the process with EXIT_FAILURE.
 *
 * Wrapped in do { ... } while (0) so that `FAILED(...);` behaves as a single
 * statement, including inside an unbraced if/else.
 */
#define FAILED(...)                                                            \
    do {                                                                       \
        fprintf(stderr, __VA_ARGS__);                                          \
        fprintf(stderr, "\n");                                                 \
        exit(EXIT_FAILURE);                                                    \
    }                                                                          \
    while (0)


/*
 * Token-done callback: writes the raw bytes of each #text token to stdout
 * and ignores every other kind of token. The token is always returned
 * unchanged.
 */
static lxb_html_token_t *
token_callback(lxb_html_tokenizer_t *tkz, lxb_html_token_t *token, void *ctx)
{
    if (token->tag_id == LXB_TAG__TEXT) {
        /* The text is delimited by two pointers rather than a terminator. */
        int length = (int) (token->text_end - token->text_start);

        printf("%.*s", length, token->text_start);
    }

    return token;
}

/*
 * step_two.c entry point.
 *
 * Feeds a single <td> fragment through the lexbor HTML tokenizer; the text
 * content is printed by token_callback as tokens are completed. Any failing
 * tokenizer call ends the program through FAILED.
 */
int
main(int argc, const char *argv[])
{
    lxb_status_t status;
    lxb_html_tokenizer_t *tkz;

    /* Inner double quotes must be escaped to stay inside the C literal. */
    const lxb_char_t data[] = "<td headers=\"choose-this-header\">0.7</td>";

    (void) argc;
    (void) argv;

    printf("HTML:\n%s\n\n", (const char *) data);
    printf("Result:\n");

    tkz = lxb_html_tokenizer_create();
    status = lxb_html_tokenizer_init(tkz);
    if (status != LXB_STATUS_OK) {
        FAILED("Failed to create tokenizer object");
    }

    /* Set callback for token */
    lxb_html_tokenizer_callback_token_done_set(tkz, token_callback, NULL);

    status = lxb_html_tokenizer_begin(tkz);
    if (status != LXB_STATUS_OK) {
        FAILED("Failed to prepare tokenizer object for parsing");
    }

    /* sizeof - 1: do not feed the terminating NUL to the tokenizer. */
    status = lxb_html_tokenizer_chunk(tkz, data, (sizeof(data) - 1));
    if (status != LXB_STATUS_OK) {
        FAILED("Failed to parse the html data");
    }

    status = lxb_html_tokenizer_end(tkz);
    if (status != LXB_STATUS_OK) {
        FAILED("Failed to ending of parsing the html data");
    }

    printf("\n");

    lxb_html_tokenizer_destroy(tkz);

    return 0;
}

step_two.c output:

HTML:
<td headers="choose-this-header">0.7</td>

Result:
0.7

Additional details:

  1. Preferable to stay using lexbor because it is fast
  2. Using Ubuntu 20.04.1 LTS
  3. Compile with gcc myprogram.c -llexbor -o myprogram
  4. Installation instructions for various OS found here

Summary question:

Q1. How can the program step_one.c be modified to save the ENTIRE line into a buffer? Once this has been achieved, it will be relatively simple to combine the two programs into one where the variable data[] in step_two.c will be the entire line found using the logic seen in step_one.c.

Answers:

Thank you for visiting the Q&A section on Magenaut. Please note that all the answers may not help you solve the issue immediately. So please treat them as advisements. If you found the post helpful (or not), leave a comment & I’ll get back to you as soon as possible.

Method 1

Here’s an example:

#include <lexbor/html/html.h>
#include <lexbor/css/css.h>
#include <lexbor/selectors/selectors.h>


/*
 * Serialization sink: prints the `len` bytes starting at `data` to stdout
 * and reports success. `ctx` is not used.
 */
lxb_status_t
callback(const lxb_char_t *data, size_t len, void *ctx)
{
    const char *chunk = (const char *) data;
    int chunk_len = (int) len;

    (void) ctx;

    printf("%.*s", chunk_len, chunk);

    return LXB_STATUS_OK;
}

/*
 * Match handler: prints the matched node in three forms, each under its own
 * heading. The output is produced through callback(), which prints to stdout.
 *
 *   1. the node's own tag only,
 *   2. the node together with its children,
 *   3. the children only.
 *
 * `spec` and `ctx` are not used. The status of each serialize call is
 * deliberately discarded; LXB_STATUS_OK is always returned.
 */
lxb_status_t
find_callback(lxb_dom_node_t *node, lxb_css_selector_specificity_t *spec,
              void *ctx)
{
    (void) spec;
    (void) ctx;

    printf("Tag:\n");

    /* Print only <td> tag. */
    (void) lxb_html_serialize_cb(node, callback, NULL);

    printf("\n\nTag with children:\n");

    /* Print <td> element and all children in <td>. */
    (void) lxb_html_serialize_tree_cb(node, callback, NULL);

    printf("\n\nChildren:\n");

    /* Print children in <td>. */
    (void) lxb_html_serialize_deep_cb(node, callback, NULL);

    /* Use lxb_html_serialize_*_str(...) for buffer. */

    return LXB_STATUS_OK;
}

/*
 * Method 1 entry point.
 *
 * Parses a small HTML table, compiles the CSS selector
 * td[header='choose-this-header'], and runs it against <body>; every match
 * is handed to find_callback.
 *
 * Returns EXIT_SUCCESS when every step succeeds and EXIT_FAILURE otherwise.
 * All objects created so far are released on both paths through the single
 * cleanup label.
 */
int main(void) {
    int exit_code = EXIT_FAILURE;
    lxb_status_t status;
    lxb_dom_node_t *body;
    lxb_html_document_t *document = NULL;
    lxb_css_parser_t *parser = NULL;
    lxb_selectors_t *selectors = NULL;
    lxb_css_selector_list_t *list = NULL;

    /* Inner double quotes must be escaped to stay inside the C literal. */
    const lxb_char_t html[] = "<table>"
            "<tbody>"
            "<tr>"
            "<td header=\"choose-this-header\">0.7</td>"
            "<td header=\"ignore-this-header\">1.3</td>"
            "<td header=\"ignore-this-header\">5.4</td>"
            "</tr>"
            "</tbody>"
            "</table>";

    static const lxb_char_t slctrs[] = "td[header='choose-this-header']";

    document = lxb_html_document_create();
    if (document == NULL) {
        goto cleanup;
    }

    status = lxb_html_document_parse(document, html, sizeof(html) - 1);
    if (status != LXB_STATUS_OK) {
        goto cleanup;
    }

    /* Create CSS parser. */

    parser = lxb_css_parser_create();
    status = lxb_css_parser_init(parser, NULL, NULL);
    if (status != LXB_STATUS_OK) {
        goto cleanup;
    }

    /* Selectors. */

    selectors = lxb_selectors_create();
    status = lxb_selectors_init(selectors);
    if (status != LXB_STATUS_OK) {
        goto cleanup;
    }

    list = lxb_css_selectors_parse(parser, slctrs, sizeof(slctrs) - 1);
    if (parser->status != LXB_STATUS_OK) {
        goto cleanup;
    }

    /* Find DOM/HTML nodes by selectors. */

    body = lxb_dom_interface_node(lxb_html_document_body_element(document));
    if (body == NULL) {
        goto cleanup;
    }

    status = lxb_selectors_find(selectors, body, list, find_callback, NULL);
    if (status != LXB_STATUS_OK) {
        goto cleanup;
    }

    printf("\n");

    exit_code = EXIT_SUCCESS;

cleanup:
    /* Each pointer is NULL until its object exists, so skip those. */

    /* Destroy Selectors object. */
    if (selectors != NULL) {
        (void) lxb_selectors_destroy(selectors, true);
    }

    /* Destroy resources for CSS Parser. */
    if (parser != NULL) {
        (void) lxb_css_parser_destroy(parser, true);
    }

    /* Destroy all Selector List memory. */
    if (list != NULL) {
        lxb_css_selector_list_destroy_memory(list);
    }

    /* Destroy HTML Document. */
    if (document != NULL) {
        lxb_html_document_destroy(document);
    }

    return exit_code;
}

Output:

Tag:
<td header="choose-this-header">

Tag with children:
<td header="choose-this-header">0.7</td>

Children:
0.7


All methods were sourced from stackoverflow.com or stackexchange.com and are licensed under CC BY-SA 2.5, CC BY-SA 3.0 and CC BY-SA 4.0.

0 0 votes
Article Rating
Subscribe
Notify of
guest

0 Comments
Inline Feedbacks
View all comments
0
Would love your thoughts, please comment.x
()
x