JavaScript heap out of memory - web scraping requests to thousands of pages using Cheerio

I am currently working on a web scraper to collect assessor's data for houses in my state. Things were going swimmingly, but unfortunately the data is deeply nested, so I have to do a lot of work just to get the links to the pages I actually need.

The data is structured as follows:

  • There are 81 towns, giving an array of links, one per town.
  • Each town paginates its streets by the street's first letter/symbol, with roughly 20-30 letter pages per town, so I need the letter-page links for every town (putting us at roughly 2,500 pages).
  • There are roughly 20 streets per letter page, and I need to scrape those street pages to get the houses on each street (about 45,000 pages).
  • There are on average 16 houses per street, so I need those URLs (roughly 750,000 in total) and then have to visit each one and scrape the content for that house (rough math below).
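
To put rough numbers on it, here is the back-of-envelope math, using the averages above rather than exact counts:

    // Rough page-count estimate, assuming the averages above (not exact figures)
    const towns = 81;
    const letterPagesPerTown = 30;      // roughly 20-30 in practice
    const streetsPerLetterPage = 20;
    const housesPerStreet = 16;

    const letterPages = towns * letterPagesPerTown;          // ~2,430 letter pages
    const streetPages = letterPages * streetsPerLetterPage;  // ~48,600 street pages
    const housePages = streetPages * housesPerStreet;        // ~777,600 house pages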

Currently, my code collects all of the street links, and I am now trying to build an array of links to each house (roughly 750,000 entries).

This means I need to make roughly 45,000 GET requests in order to scrape this data. Here is what my code looks like:

    const cheerio = require('cheerio');
    const axios = require('axios');
    const PromisePool = require('@supercharge/promise-pool');
    
    // Overrides Array.prototype.flat with a custom implementation (Node 15 already provides flat natively)
    Object.defineProperty(Array.prototype, 'flat', {
        value: function (depth = 1) {
            return this.reduce(function (flat, toFlatten) {
                return flat.concat((Array.isArray(toFlatten) && (depth > 1)) ? toFlatten.flat(depth - 1) : toFlatten);
            }, []);
        }
    });
    
    (async function () {
        try {
            let numOfErrors = 0; // counts failed requests (incremented in the catch block below)
            const base = 'https://www.vgsi.com/massachusetts-online-database/';
            const {data} = await axios.get(base);
            const $ = cheerio.load(data);
    
            const townLinks = $('.bluelink').map((i, elem) => {
                const link = elem.attribs.href;
                const townLink = link.endsWith('/') ? link.slice(0, -1) : `${link}`;
                const streetsSuffix = 'Streets.aspx';
                return {
                    townLink: `${townLink}/${streetsSuffix}`,
                    baseLink: townLink
                };
            }).get();
    
            let letterLinks = await Promise.all(townLinks.map(async ({townLink, baseLink}) => {
                try {
                    const {data} = await axios.get(townLink);
                    const $ = cheerio.load(data);
                    const links = $('div.buttonMe a').map((i, a) => {
                        return {
                            letterLink: `${baseLink}/${$(a).attr('href')}`,
                            baseLink: baseLink
                        }
                    }).get();
                    return links;
                } catch (e) {
                    numOfErrors++;
                    console.log("letter link error", townLink, e);
                }
            }));
    
            letterLinks = letterLinks.filter(link => link).flat();
    
            console.log(letterLinks);
    
            let streetLinks = await Promise.all(letterLinks.map(async ({baseLink, letterLink}) => {
                try {
                    const {data} = await axios.get(letterLink);
                    const $ = cheerio.load(data);
                    const links = $('li.fixedButton a').map((i, a) => {
                        return {
                            streetLink: `${baseLink}/${$(a).attr('href')}`,
                            baseLink
                        };
                    }).get();
                    return links;
                } catch (e) {
                    console.log('street link error', letterLink, e)
                }
            }));
    
            streetLinks = streetLinks.filter(link => link).flat();
    
            const {results, errors} = await PromisePool
                .for(streetLinks)
                .withConcurrency(500)
                .handleError(async (error, {streetLink}) => {
                    console.log("Housing links error", streetLink, error);
                })
                .process(async ({baseLink, streetLink}) => {
                    const {data} = await axios.get(streetLink);
                    const $ = cheerio.load(data);
                    const links = $('#list > li > a').map((i, a) => {
                        return {
                            houseLink: `${baseLink}/${$(a).attr('href')}`,
                            baseLink
                        };
                    }).get();
    
                    return links;
                });
            console.log(results);
            console.log(errors);
    
        } catch (e) {
            console.log('Bad Error', e);
        }
    })();

When I try to run this, however, I get a JavaScript heap out of memory error:

    <--- Last few GCs --->
    
    [51190:0x1049d1000]   618286 ms: Scavenge (reduce) 8083.5 (8238.8) -> 8083.2 (8238.8) MB, 9.5 / 0.0 ms  (average mu = 0.467, current mu = 0.351) allocation failure 
    [51190:0x1049d1000]   618306 ms: Scavenge (reduce) 8083.8 (8238.8) -> 8083.4 (8238.8) MB, 9.9 / 0.0 ms  (average mu = 0.467, current mu = 0.351) allocation failure 
    [51190:0x1049d1000]   618323 ms: Scavenge (reduce) 8084.1 (8238.8) -> 8083.9 (8239.1) MB, 10.4 / 0.0 ms  (average mu = 0.467, current mu = 0.351) allocation failure 
    
    
    <--- JS stacktrace --->
    
    FATAL ERROR: MarkCompactCollector: young object promotion failed Allocation failed - JavaScript heap out of memory
     1: 0x1013611f5 node::Abort() (.cold.1) [/Users/mitchell/.nvm/versions/node/v15.2.1/bin/node]
     2: 0x1000be519 node::Abort() [/Users/mitchell/.nvm/versions/node/v15.2.1/bin/node]
     3: 0x1000be67f node::OnFatalError(char const*, char const*) [/Users/mitchell/.nvm/versions/node/v15.2.1/bin/node]
     4: 0x10022f6a7 v8::Utils::ReportOOMFailure(v8::internal::Isolate*, char const*, bool) [/Users/mitchell/.nvm/versions/node/v15.2.1/bin/node]
     5: 0x10022f643 v8::internal::V8::FatalProcessOutOfMemory(v8::internal::Isolate*, char const*, bool) [/Users/mitchell/.nvm/versions/node/v15.2.1/bin/node]
     6: 0x1003e9a95 v8::internal::Heap::FatalProcessOutOfMemory(char const*) [/Users/mitchell/.nvm/versions/node/v15.2.1/bin/node]
     7: 0x1004470f4 v8::internal::EvacuateNewSpaceVisitor::Visit(v8::internal::HeapObject, int) [/Users/mitchell/.nvm/versions/node/v15.2.1/bin/node]
     8: 0x10042e76b void v8::internal::LiveObjectVisitor::VisitBlackObjectsNoFail<v8::internal::EvacuateNewSpaceVisitor, v8::internal::MajorNonAtomicMarkingState>(v8::internal::MemoryChunk*, v8::internal::MajorNonAtomicMarkingState*, v8::internal::EvacuateNewSpaceVisitor*, v8::internal::LiveObjectVisitor::IterationMode) [/Users/mitchell/.nvm/versions/node/v15.2.1/bin/node]
     9: 0x10042e296 v8::internal::FullEvacuator::RawEvacuatePage(v8::internal::MemoryChunk*, long*) [/Users/mitchell/.nvm/versions/node/v15.2.1/bin/node]
    10: 0x10042df96 v8::internal::Evacuator::EvacuatePage(v8::internal::MemoryChunk*) [/Users/mitchell/.nvm/versions/node/v15.2.1/bin/node]
    11: 0x10044be2e v8::internal::PageEvacuationTask::RunInParallel(v8::internal::ItemParallelJob::Task::Runner) [/Users/mitchell/.nvm/versions/node/v15.2.1/bin/node]
    12: 0x100403b82 v8::internal::ItemParallelJob::Task::RunInternal() [/Users/mitchell/.nvm/versions/node/v15.2.1/bin/node]
    13: 0x100404008 v8::internal::ItemParallelJob::Run() [/Users/mitchell/.nvm/versions/node/v15.2.1/bin/node]
    14: 0x1004300a5 void v8::internal::MarkCompactCollectorBase::CreateAndExecuteEvacuationTasks<v8::internal::FullEvacuator, v8::internal::MarkCompactCollector>(v8::internal::MarkCompactCollector*, v8::internal::ItemParallelJob*, v8::internal::MigrationObserver*, long) [/Users/mitchell/.nvm/versions/node/v15.2.1/bin/node]
    15: 0x10042fc67 v8::internal::MarkCompactCollector::EvacuatePagesInParallel() [/Users/mitchell/.nvm/versions/node/v15.2.1/bin/node]
    16: 0x10041aa07 v8::internal::MarkCompactCollector::Evacuate() [/Users/mitchell/.nvm/versions/node/v15.2.1/bin/node]
    17: 0x1004182cb v8::internal::MarkCompactCollector::CollectGarbage() [/Users/mitchell/.nvm/versions/node/v15.2.1/bin/node]

Right now, I am trying to request those 45,000 street pages and build an array of the roughly 750,000 house links they contain. After that, I need to visit each of those 750,000 links, scrape the data from each house page, and save it to a database.
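
For context, the final per-house step would look something like the sketch below. The selectors and the saveHouse function are placeholders; I have not written the real versions yet:

    // Rough sketch of the per-house scrape; selectors and saveHouse() are placeholders
    async function scrapeHouse({houseLink}) {
        const {data} = await axios.get(houseLink);
        const $ = cheerio.load(data);
        const record = {
            url: houseLink,
            address: $('#MainContent_lblLocation').text().trim(), // placeholder selector
            owner: $('#MainContent_lblOwner').text().trim()       // placeholder selector
        };
        await saveHouse(record); // placeholder for the eventual database insert
    }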

How can I do this without running out of heap memory? Obviously this is a lot of data to work with, which is why I am using the promise pool to keep too many jobs from running at once. But even when I drop the concurrency from 500 to 2, I still get the same error. I also increased the heap by running node --max-old-space-size=8192 scraper.js, but it still is not enough for the job.
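
One direction I have been considering is to stop accumulating everything in memory and instead append each page's house links to a file as soon as they are scraped, roughly like the sketch below. This would replace the current PromisePool block inside the IIFE; the house-links.ndjson path and the lower concurrency are just guesses on my part. Would something like this keep memory bounded, or is there a better pattern for a job this size?

    const fs = require('fs');

    // Append each page's house links to disk as they arrive, instead of building a 750,000-element array
    const out = fs.createWriteStream('house-links.ndjson', {flags: 'a'}); // placeholder output path

    await PromisePool
        .for(streetLinks)
        .withConcurrency(50) // guessing that a lower concurrency also helps
        .handleError(async (error, {streetLink}) => {
            console.log('Housing links error', streetLink, error);
        })
        .process(async ({baseLink, streetLink}) => {
            const {data} = await axios.get(streetLink);
            const $ = cheerio.load(data);
            $('#list > li > a').each((i, a) => {
                // One JSON object per line; ignoring write-stream backpressure for brevity
                out.write(JSON.stringify({houseLink: `${baseLink}/${$(a).attr('href')}`, baseLink}) + '\n');
            });
        });

    out.end();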