top | item 40486818

(no title)

balnaphone | 1 year ago

I would save my data in CSV format, then use this. Save the code below as chunk.pl (remove the leading spaces), make sure the Text::CSV module is installed, and run it as "perl chunk.pl" (with no arguments it prints a usage message):

    #!/usr/bin/perl
    
    # chunk.pl -- split a CSV file into numbered chunk files
    #
    # Arguments (all but the first are optional):
    #   input_csv               path of the CSV file to split
    #   chunk_size              records per output file (default 500)
    #   output_filename_format  sprintf format that receives the chunk number
    #                           (default "data-%08d.csv")
    #   separator               field separator (default ",")
    #
    # Every record read is treated the same way, so a header line is counted
    # as an ordinary record and is not repeated in later chunks.
    
    # Modules are loaded with "use" instead of -M/-C switches on the #! line:
    # perl refuses those switches there ("Too late for -M/-C option") when the
    # script is started as "perl chunk.pl". Both data handles below get an
    # explicit UTF-8 layer, so no global -C setting is needed.
    use strict;
    use warnings;
    use Text::CSV;
    
    # Usage message and exit if needed
    if (!@ARGV || $ARGV[0] eq '-h') {
        print "Usage: $0 input_csv [chunk_size] [output_filename_format] [separator]\n";
        print "Example: $0 input.csv 500 'input-%08d.csv' ','\n";
        exit;
    }
    
    # Set command-line arguments
    my ($INFILE, $CHUNKSIZE, $FMT, $SEP) = @ARGV;
    $CHUNKSIZE //= 500;
    $FMT //= "data-%08d.csv";
    $SEP //= ",";
    
    # A zero or non-numeric chunk size would otherwise abort later with
    # "Illegal modulus zero" in the main loop.
    $CHUNKSIZE =~ /\A[1-9][0-9]*\z/
        or die "chunk_size must be a positive integer, got '$CHUNKSIZE'\n";
    
    # If the format ignores the chunk number, every chunk would be written to
    # the same file and all but the last would be lost.
    die "output_filename_format '$FMT' must contain a numeric conversion such as %08d\n"
        if sprintf($FMT, 1) eq sprintf($FMT, 2);
    
    # Initialize CSV, file handles, and counters.
    # new() returns undef on invalid attributes (e.g. a bad separator).
    my $csv = Text::CSV->new({ binary => 1, auto_diag => 1, sep_char => $SEP, eol => "\n" })
        or die "Cannot create CSV parser: " . Text::CSV->error_diag() . "\n";
    my ($i, $f, $out, $outfile) = (0, 1, undef, undef);
    open my $in, "<:encoding(UTF-8)", $INFILE or die "Cannot open $INFILE: $!";
    
    # Main loop: start a new output file every $CHUNKSIZE records
    while (my $row = $csv->getline($in)) {
        if ($i % $CHUNKSIZE == 0) {
            if (defined $out) {
                # Buffered write errors only surface at close time.
                close $out or die "Cannot close $outfile: $!";
            }
            $outfile = sprintf($FMT, $f++);
            open $out, ">:encoding(UTF-8)", $outfile or die "Cannot open output file $outfile: $!";
        }
        $csv->print($out, $row) or die "Failed to write row: $!";
        $i++;
    }
    
    # getline() returns false both at end of file and on a parse error; make
    # sure the loop ended for the former so output is not silently truncated.
    $csv->eof
        or die "Stopped before end of $INFILE after $i records: " . $csv->error_diag() . "\n";
    
    # Clean up: close file handles
    if (defined $out) {
        close $out or die "Cannot close $outfile: $!";
    }
    close $in or die "Cannot close $INFILE: $!";

discuss

order

balnaphone|1 year ago

I suppose R might be a better choice...

    # Split input.csv into files of at most chunk_size rows each,
    # named data-0001.csv, data-0002.csv, ...
    library(readr)
    library(dplyr)
    library(purrr)
    data <- read_csv("input.csv")
    chunk_size <- 500
    # seq_len() yields an empty sequence for a zero-row table, whereas
    # seq_along(1:nrow(data)) would yield c(1, 2) because 1:0 counts down.
    chunks <- split(data, ceiling(seq_len(nrow(data)) / chunk_size))
    # iwalk() passes the list names as .y; split() names are character strings,
    # so convert to integer before formatting with %d (sprintf errors otherwise).
    iwalk(chunks, ~write_csv(.x, sprintf("data-%04d.csv", as.integer(.y))))