top | item 14380858

(no title)

gwu78 | 8 years ago

This is my "Hacker News Reader". It converts HN to csv. (Only selected fields of interest to me.) From there it can easily be imported into kdb+. I have more reusable generalized lex techniques for other websites but HN is so simple it can be done via a braindead one-off as below.

Requirements: lex, cc

Usage:

   fetch -4o yc.htm https://news.ycombinator.com
   yc < yc.htm 
To compile this I use something like

    flex -Crfa -8 -i yc.l;
    cc -Wall -pipe lex.yy.c -static -o yc;
Save the text below as yc.l then compile as above.

    #define jmp BEGIN
    #define p printf
    #define x yytext
   %s aa bb cc dd ee ff gg hh
   %s ii jj kk ll mm nn oo 
   aa "span class=\"rank\""
   bb "a href=\""
   cc score_........
   dd \>
    /* #include <time.h> */
    /* #include <util.h> */
   %%
   [^\12\40-\176]
   , p("%2c");
    /* rank (dont care) */
   {aa} jmp aa;
    /* <aa>[1-9][^<\.]* p("\n%s,",x);jmp bb; */
   <aa>[1-9][^<\.]* p("\n");jmp bb; 
    /* url */
   <bb>{bb} jmp cc;
   <cc>http[^"]* p("%s,",x);jmp dd; 
    /* title */
   <dd>{dd} jmp ee;
    /* <ee>[^><]* p("%s,",x);jmp ff; */
   <ee>[^><]* p("%s",x);jmp ff;
    /* host (omit) */
    /* points (dont care) */
   <ff>{cc} jmp gg;
   <gg>{dd} jmp hh;
    /* <hh>[1-9][^<> p]* p("%s,",x);jmp ii; */
   <hh>[1-9][^<> p]* p(",");jmp ii; 
    /* user */
   <ii>{bb} jmp jj;
   <jj>http[^"]* p("%s,",x);jmp kk;
    /* time (dont care) */
   <kk>{bb} jmp ll;
    /* <ll>http[^"]* ; */
   <ll>http[^"]* jmp mm;
    /* unix time (dont care) */
    /* <ll>[1-9][^<]* { 
    time_t t0; time_t t1; time_t t2;
    t1=time(&t0);
    t2=parsedate(x,&t1,0);
    p("%d,",t2); 
    jmp mm;
    } */
    /* item */
   <mm>{bb} jmp nn;
    /* <nn>http[^"]* p("%s,",x);jmp oo; */
   <nn>http[^"]* p("%s",x);jmp oo;
    /* comments (dont care) */
   <oo>{dd} jmp oo;
    /* <oo>[1-9d][^ <]* p("%s",x);jmp 0; */
   <oo>[1-9d][^ <]* jmp 0;
   .
   \n
   %%
   main(){ yylex();}
   yywrap(){ p("\n");}

discuss

order

deno|8 years ago

And I thought using regex for parsing HTML was bad.