Scrapping Data from BPS Website using Java

BPS (Badan Pusat Statistik) is a non-departmental government institute of Indonesia that is responsible for conducting statistical surveys. On its website, we can see that there are a lot of data available, especially regarding to spatial and regional such as number of provinces, zipcodes, cities, and others.

One BPS website which is contains interesting data is https://sig.bps.go.id/, an on this sample im trying to crawl and read the zipcode data from it.

Lets start with pom file,

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.edw</groupId>
    <artifactId>ScrappingBPS</artifactId>
    <version>1.0</version>

    <properties>
        <java.version>11</java.version>
        <maven.compiler.source>11</maven.compiler.source>
        <maven.compiler.target>11</maven.compiler.target>
    </properties>

    <dependencies>
        <dependency>
            <groupId>com.squareup.okhttp3</groupId>
            <artifactId>okhttp</artifactId>
            <version>4.9.3</version>
        </dependency>
        <dependency>
            <groupId>org.mybatis</groupId>
            <artifactId>mybatis</artifactId>
            <version>3.5.5</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.25</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>2.13.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-api</artifactId>
            <version>2.15.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-core</artifactId>
            <version>2.15.0</version>
        </dependency>

        <dependency>
            <groupId>com.lmax</groupId>
            <artifactId>disruptor</artifactId>
            <version>3.4.4</version>
        </dependency>

    </dependencies>
</project>

Since im using MyBatis, so lets create a SessionFactory class.

package com.edw.config;

import com.edw.mapper.KodeposMapper;
import com.edw.mapper.RegionsMapper;
import org.apache.ibatis.io.Resources;
import org.apache.ibatis.session.SqlSessionFactory;
import org.apache.ibatis.session.SqlSessionFactoryBuilder;

import java.io.IOException;
import java.io.Reader;

public class MyBatisSqlSessionFactory {
    private static final SqlSessionFactory FACTORY;

    static {
        try {
            Reader reader = Resources.getResourceAsReader("configuration.xml");
            FACTORY = new SqlSessionFactoryBuilder().build(reader);
            FACTORY.getConfiguration().addMapper(KodeposMapper.class);
            
        } catch (IOException e) {
            throw new RuntimeException("Fatal Error. Cause: " + e, e);
        }
    }

    public static SqlSessionFactory getSqlSessionFactory() {
        return FACTORY;
    }
}

And a bean for table representation, and a mapper interface.

package com.edw.bean;

import java.io.Serializable;

public class Kodepos implements Serializable {
    private Long id;
    private String kelurahan;
    private String kecamatan;
    private String kabupaten;
    private String provinsi;
    private String kodepos;

    public Kodepos() {
    }

    // other setter and getter
}
package com.edw.mapper;

import com.edw.bean.Kodepos;
import org.apache.ibatis.annotations.Insert;

public interface KodeposMapper {

    @Insert("INSERT INTO `db_kodepos`.`tbl_kodepos` " +
            "(`kelurahan`, `kecamatan`, `kabupaten`, `provinsi`, `kodepos`) " +
            "VALUES (#{kelurahan}, #{kecamatan}, #{kabupaten}, #{provinsi}, #{kodepos})")
    Integer insert(Kodepos kodepos);
}

And last is our main class, im using okhttp to do http request and jackson for parsing json to java object

package com.edw;

import com.edw.bean.Kodepos;
import com.edw.config.MyBatisSqlSessionFactory;
import com.edw.mapper.KodeposMapper;
import com.fasterxml.jackson.databind.ObjectMapper;
import okhttp3.Call;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.ibatis.session.SqlSession;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.IOException;
import java.util.*;
import java.util.concurrent.TimeUnit;

public class KodeposScrapping {

    private SqlSession sqlSession = null;
    private KodeposMapper kodeposMapper  = null;

    private static final String BASE_URL = "https://sig.bps.go.id";

    private Logger logger = LogManager.getLogger(KodeposScrapping.class);

    public KodeposScrapping() {
    }

    public static void main(String[] args) throws IOException {
        KodeposScrapping kodeposScrapping = new KodeposScrapping();
        kodeposScrapping.doScrapping();
    }

    private void doScrapping() throws IOException {
        getProvinsis();
    }

    /**
     *  <pre> curl "https://sig.bps.go.id/rest-drop-down/getwilayah" </pre>
     */
    private void getProvinsis() throws IOException {
        Request request = new Request.Builder()
                .url(BASE_URL + "/rest-drop-down/getwilayah")
                .get()
                .build();

        List<HashMap> hashMaps = doHttpCall(request);

        for (HashMap hashMap : hashMaps) {
            Kodepos kodepos = new Kodepos();
            kodepos.setProvinsi(hashMap.get("nama").toString());

            logger.info("start processing {}", hashMap.get("nama").toString());

            // get Kabupatens from Provinsi
            getKabupatens(hashMap.get("kode").toString(), kodepos);

            logger.info("done processing {}", hashMap.get("nama").toString());
        }
    }

    /**
     *  <pre> curl "https://sig.bps.go.id/rest-drop-down/getwilayah?level=kabupaten&parent=11" </pre>
     */
    private void getKabupatens(String parent, Kodepos kodepos) throws IOException {
        Request request = new Request.Builder()
                .url(BASE_URL + "/rest-drop-down/getwilayah?level=kabupaten&parent="+parent)
                .get()
                .build();

        List<HashMap> hashMaps = doHttpCall(request);

        for (HashMap hashMap : hashMaps) {
            kodepos.setKabupaten(hashMap.get("nama").toString());

            // get Kecamatans from Kabupaten
            getKecamatans(hashMap.get("kode").toString(), kodepos);
        }
    }

    /**
     *  <pre> curl "https://sig.bps.go.id/rest-drop-down/getwilayah?level=kecamatan&parent=1101" </pre>
     */
    private void getKecamatans(String parent, Kodepos kodepos) throws IOException {
        Request request = new Request.Builder()
                .url(BASE_URL + "/rest-drop-down/getwilayah?level=kecamatan&parent="+parent)
                .get()
                .build();

        List<HashMap> hashMaps = Collections.synchronizedList(doHttpCall(request));

        for (HashMap hashMap : hashMaps) {
            kodepos.setKecamatan(hashMap.get("nama").toString());
            getKelurahans(hashMap.get("kode").toString(), kodepos);
        }
    }

    /**
     *  <pre> curl "https://sig.bps.go.id/rest-bridging-pos/getwilayah?level=desa&parent=1101050" </pre>
     */
    private void getKelurahans(String parent, Kodepos kodepos) throws IOException {
        Request request = new Request.Builder()
                .url(BASE_URL + "/rest-bridging-pos/getwilayah?level=desa&parent="+parent)
                .get()
                .build();

        List<HashMap> hashMaps = Collections.synchronizedList(doHttpCall(request));

        for (final HashMap hashMap : hashMaps) {
            kodepos.setKelurahan(hashMap.get("nama_bps").toString());
            kodepos.setKodepos(hashMap.get("kode_pos").toString());

            insert(kodepos);
        }
    }

    private void insert(Kodepos kodepos) {
        try {
            sqlSession = MyBatisSqlSessionFactory.getSqlSessionFactory().openSession(true);
            kodeposMapper = sqlSession.getMapper(KodeposMapper.class);

            kodeposMapper.insert(kodepos);
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
        } finally {
            if (sqlSession != null) {
                sqlSession.close();
            }
        }
    }

    private List<HashMap> doHttpCall(Request request) throws IOException {
        Call call = new OkHttpClient().newBuilder()
                .retryOnConnectionFailure(true)
                .connectTimeout(300, TimeUnit.SECONDS)
                .readTimeout(300, TimeUnit.SECONDS)
                .writeTimeout(300, TimeUnit.SECONDS).build().newCall(request);
        Response response = call.execute();

        ObjectMapper objectMapper = new ObjectMapper();
        List<HashMap> hashMaps = objectMapper.readValue(response.body().string(), List.class);

        response.close();
        return hashMaps;
    }
}

Code can be found in my github repository,

https://github.com/edwin/bps-data-scrapper

No Comments

Leave a Comment

Please be polite. We appreciate that.
Your email address will not be published and required fields are marked