spring 整合 mybatis+webmagic爬取数据并持久化

因为最近在爬数据进行分析,数据已经爬好了,但最后还是需要持久化到数据库。因为公司用的持久化框架是mybatis,这里面又不需要mvc的架构,所以只需要spring 和 mybatis进行整合就行了,spring 作为bean容器,mybatis负责orm映射和持久化。

我这边用的是gradle构建工具,下面是我的依赖:

    // Crawler: WebMagic core/extension plus Selenium/PhantomJS drivers for rendered pages.
    compile 'us.codecraft:webmagic-core:0.5.3'
    compile('us.codecraft:webmagic-extension:0.5.3')
    compile 'org.seleniumhq.selenium:selenium-java:2.8.0'
    compile group: 'us.codecraft', name: 'webmagic-selenium', version: '0.5.2'
    compile 'com.github.detro:phantomjsdriver:1.2.0'
    testCompile group: 'junit', name: 'junit', version: '4.11'
    compile group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.8.5'
    // Spring container. The spring-aop version previously read '4.2.4.RELEA SE' (embedded
    // space), which is not a resolvable version; it now matches the other Spring modules.
    compile 'org.springframework:spring-aop:4.2.4.RELEASE'
    compile 'org.springframework:spring-context:4.2.4.RELEASE'
    compile 'org.springframework:spring-beans:4.2.4.RELEASE'
    compile 'org.springframework:spring-web:4.2.4.RELEASE'
    compile 'org.springframework:spring-webmvc:4.2.4.RELEASE'
    compile 'org.springframework:spring-tx:4.2.4.RELEASE'
    compile 'org.springframework:spring-jdbc:4.2.4.RELEASE'
    compile 'org.springframework:spring-test:4.2.4.RELEASE'
    // Persistence: MySQL driver, MyBatis and its Spring integration, DBCP2 connection pool.
    compile 'mysql:mysql-connector-java:5.1.38'
    compile 'org.mybatis.generator:mybatis-generator-core:1.3.2'
    compile 'org.mybatis:mybatis-spring:1.2.3'
    compile 'org.mybatis:mybatis:3.3.0'
    compile group: 'org.apache.logging.log4j', name: 'log4j-core', version: '2.6.2'
    compile group: 'org.apache.commons', name: 'commons-dbcp2', version: '2.1.1'
    // Lombok generates the getters/setters/builder of the PO class at compile time.
    compile group: 'org.projectlombok', name: 'lombok', version: '1.16.10'

接下来是实体类PO(基金):

@Data
@Builder
public class Fund{
    private int id;
    private String fundCode;
    private String fundName;
    private String dailyGrowthRate;
    private String monthlyGrowthRate;
}

数据库的schema如下:

-- Table that receives the scraped funds; rows are written by the FundMapper `insert` statement.
CREATE TABLE `fund` (
  `id` int(11) NOT NULL AUTO_INCREMENT, -- surrogate primary key generated by MySQL
  `fund_code` varchar(255) DEFAULT NULL, -- mapped to Fund.fundCode
  `fund_name` varchar(255) DEFAULT NULL, -- mapped to Fund.fundName
  `daily_growth_rate` varchar(255) DEFAULT NULL, -- mapped to Fund.dailyGrowthRate (kept as text)
  `monthly_growth_rate` varchar(255) DEFAULT NULL, -- mapped to Fund.monthlyGrowthRate (kept as text)
  PRIMARY KEY (`id`)
  -- NOTE(review): AUTO_INCREMENT=11594 looks like a counter carried over from an existing
  -- table dump; confirm whether a fresh schema should really start from that value.
) ENGINE=InnoDB AUTO_INCREMENT=11594 DEFAULT CHARSET=utf8;


然后是 Mapper 接口 FundMapper:

/**
 * MyBatis mapper for the {@code fund} table. The SQL for each method is defined in the
 * mapper XML whose namespace is {@code stock.mapper.FundMapper}.
 */
public interface FundMapper {
    /**
     * Executes the {@code insert} statement of the mapper XML for the given fund.
     *
     * @param fund the fund whose properties are bound to the statement parameters
     * @return the row count reported by MyBatis for the insert
     */
    int insert(Fund fund);
}

然后是业务类FundService:
其中@Service注解配合component-scan会把这个类注册为Spring容器中的bean,
@Autowired 则按照类型把FundMapper装配进来。

/**
 * Service-layer entry point for persisting {@link Fund} objects; it delegates straight to
 * {@link FundMapper}.
 */
@Service
public class FundService {

    /** Mapper used for all database access; injected by type. */
    @Autowired
    private FundMapper fundMapper;

    /**
     * Stores the given fund through the mapper.
     *
     * @param fund the fund to insert
     * @return whatever {@link FundMapper#insert(Fund)} returns for this insert
     */
    public int insert(Fund fund) {
        final int affectedRows = fundMapper.insert(fund);
        return affectedRows;
    }

}

接下来是FundMapper.xml文件(放在classpath的mapper目录下):

<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd" >

<!-- SQL statements backing the stock.mapper.FundMapper interface. -->
<mapper namespace="stock.mapper.FundMapper">

    <!-- Column-to-property mapping between the fund table and stock.po.Fund. -->
    <resultMap id="BaseResultMap" type="stock.po.Fund">
        <id column="id" property="id" jdbcType="INTEGER"/>
        <result column="fund_code" property="fundCode" jdbcType="VARCHAR"/>
        <result column="fund_name" property="fundName" jdbcType="VARCHAR"/>
        <result column="daily_growth_rate" property="dailyGrowthRate" jdbcType="VARCHAR"/>
        <result column="monthly_growth_rate" property="monthlyGrowthRate" jdbcType="VARCHAR"/>
    </resultMap>

    <!-- Full column list of the fund table, including the primary key. -->
    <sql id="BaseColumnList">
        id,fund_code,fund_name,daily_growth_rate,monthly_growth_rate
    </sql>

    <!--
        Inserts one fund. The id column is deliberately left out: Fund.id is a primitive int
        that is 0 unless assigned, and writing 0 into the AUTO_INCREMENT column only works
        while MySQL treats 0 as "generate a value" (it does not under NO_AUTO_VALUE_ON_ZERO).
        The database generates the key and useGeneratedKeys/keyProperty writes it back into
        Fund.id after the insert.
    -->
    <insert id="insert" parameterType="stock.po.Fund" useGeneratedKeys="true" keyProperty="id">
        INSERT INTO fund(
        fund_code,fund_name,daily_growth_rate,monthly_growth_rate
        )
        VALUES (
        #{fundCode,jdbcType=VARCHAR},
        #{fundName,jdbcType=VARCHAR},
        #{dailyGrowthRate,jdbcType=VARCHAR},
        #{monthlyGrowthRate,jdbcType=VARCHAR}
        )
    </insert>

</mapper>

接着是mybatis的配置文件(mybatis-config.xml):

<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE configuration
        PUBLIC "-//mybatis.org//DTD Config 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-config.dtd">
<!-- MyBatis configuration: its only job here is to register the mapper XML files. -->
<configuration>
    <mappers>
        <!-- Resource path of the FundMapper statements (mapper/FundMapper.xml). -->
        <mapper resource="mapper/FundMapper.xml"/>
    </mappers>

</configuration>

然后就是spring的配置文件(applicationContext.xml):

<?xml version="1.0" encoding="UTF-8"?>
<!--
    Spring bean definitions: property placeholders, component scanning, the DBCP2 data source,
    the MyBatis SqlSessionFactory and the FundMapper proxy.
-->
<beans xmlns="http://www.springframework.org/schema/beans"
       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:p="http://www.springframework.org/schema/p"
       xmlns:context="http://www.springframework.org/schema/context"
       xmlns:mvc="http://www.springframework.org/schema/mvc"
       xsi:schemaLocation="http://www.springframework.org/schema/beans
                        http://www.springframework.org/schema/beans/spring-beans-4.0.xsd
                        http://www.springframework.org/schema/context
                        http://www.springframework.org/schema/context/spring-context-4.0.xsd
                        http://www.springframework.org/schema/mvc
                        http://www.springframework.org/schema/mvc/spring-mvc-4.0.xsd">

    <!-- Load the JDBC settings used by the ${jdbc.*} placeholders below. -->
    <context:property-placeholder location="classpath:jdbc.properties"/>

    <!-- Register annotated classes (e.g. @Service) found under the stock package. -->
    <context:component-scan base-package="stock.**"/>

    <!-- Data source backed by a DBCP2 connection pool; closed together with the context. -->
    <bean id="dataSource" class="org.apache.commons.dbcp2.BasicDataSource" destroy-method="close">
        <property name="driverClassName" value="${jdbc.driver}" /><!-- the property name must be driverClassName, not driver -->
        <property name="url" value="${jdbc.url}" />
        <property name="username" value="${jdbc.username}" />
        <property name="password" value="${jdbc.password}" />
    </bean>

    <!-- sqlSessionFactory -->
    <bean id = "sqlSessionFactory" class="org.mybatis.spring.SqlSessionFactoryBean">
        <!--
            Location of the MyBatis configuration file. The explicit classpath: prefix keeps
            the lookup on the classpath regardless of which ApplicationContext implementation
            loads this file; an unprefixed path is resolved according to the context type.
        -->
        <property name="configLocation" value="classpath:mybatis-config.xml"></property>
        <!-- Data source used for all sessions. -->
        <property name="dataSource" ref="dataSource"></property>
    </bean>

    <!-- Mapper bean: MapperFactoryBean creates a proxy object from the mapper interface. -->
    <bean id="fundMapper" class="org.mybatis.spring.mapper.MapperFactoryBean">
        <property name="mapperInterface" value="stock.mapper.FundMapper"/>
        <property name="sqlSessionFactory" ref="sqlSessionFactory"/>
    </bean>


</beans>

其中jdbc.properties的文件如下:

# JDBC settings substituted into the ${jdbc.*} placeholders of the Spring dataSource bean.
jdbc.driver=com.mysql.jdbc.Driver
# Schema "test" on the local MySQL instance; the connection uses UTF-8.
jdbc.url=jdbc:mysql://127.0.0.1:3306/test?useUnicode=true&characterEncoding=utf8
# NOTE(review): root with an empty password - presumably a local development database only; confirm before reuse.
jdbc.username=root
jdbc.password=

然后逻辑代码如下:

/**
 * WebMagic {@link PageProcessor} that reads the fund listing table of each crawled page,
 * prints the extracted columns and stores one {@link Fund} per table row through
 * {@link FundService}.
 *
 * <p>The processor owns the Spring context it creates. {@link #main(String[])} closes that
 * context once the crawl has finished so the connection pool is released.
 *
 * <p>The spider is started with several worker threads, so {@link #process(Page)} may run
 * concurrently. All fields are therefore assigned once and never reassigned.
 */
public class NewFundProcessor implements PageProcessor {

    private static final Logger log = LoggerFactory.getLogger(NewFundProcessor.class);

    private static final String URL_PREFIX = "https://e.lufunds.com/jijin/allFund?subType=&haitongGrade=&fundGroupId=&currentPage=";
    private static final String URL_SUFFIX = "&orderType=twelve_month_increase_desc&canFixInvest=&searchWord=#sortTab";

    /** Number of the last listing page that is queued for crawling. */
    private static final int LAST_PAGE = 250;

    /** Spring context holding the data source, the MyBatis mapper and the service bean. */
    private final ClassPathXmlApplicationContext context;

    /** Resolved once at construction instead of once per scraped row. */
    private final FundService fundService;

    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(3000)
            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36");

    /**
     * Boots the Spring context from {@code applicationContext.xml} on the classpath and looks
     * up the {@code fundService} bean.
     */
    public NewFundProcessor() {
        context = new ClassPathXmlApplicationContext("classpath:applicationContext.xml");
        fundService = context.getBean("fundService", FundService.class);
    }

    /**
     * Extracts every row of the {@code fundTable} table on the page, prints its columns and
     * inserts the code, name, daily and monthly growth rate as a new {@link Fund}.
     *
     * @param page the downloaded listing page
     */
    @Override
    public void process(Page page) {
        System.out.println("first ------------------");
        List<String> rows = page.getHtml().xpath("div[@class='listTable']/table[@id='fundTable']/tbody/tr").all();

        for (String rowHtml : rows) {
            // The row fragment is re-parsed with its cells turned into <div> elements so the
            // column xpaths below can address them. Only the tags are rewritten, so cell text
            // or attribute values that happen to contain "td" stay untouched.
            Html h = new Html(rowHtml.replace("<td", "<div").replace("</td>", "</div>"));

            String fundCode = h.xpath("//div[1]/text()").get();
            String fundName = h.xpath("//div[2]/a/text()").get();
            String dailyGrowthRate = h.xpath("//div[4]/span/text()").get();
            String monthGrowthRate = h.xpath("//div[5]/span/text()").get();
            String startAmount = h.xpath("//div[10]/text()").get();

            System.out.println("基金代码:" + fundCode);
            System.out.println("基金简介:" + fundName);
            System.out.println("最新净值:" + h.xpath("//div[3]/p[1]/text()"));
            System.out.println("时间:" + h.xpath("//p[2]/text()"));
            System.out.println("日增长率:" + dailyGrowthRate);
            System.out.println("最近一月增长率:" + monthGrowthRate);
            System.out.println("最近三月增长率:" + h.xpath("//div[6]/span/text()"));
            System.out.println("最近一年增长率:" + h.xpath("//div[7]/span/text()"));
            System.out.println("今年增长率:" + h.xpath("//div[8]/span/text()"));
            System.out.println("成立以来增长率:" + h.xpath("//div[9]/span/text()"));
            System.out.println("起投金额:" + startAmount);

            // Fund is annotated with @Builder, so it is created through the builder rather
            // than through a no-args constructor.
            Fund fund = Fund.builder()
                    .fundCode(fundCode)
                    .fundName(fundName)
                    .dailyGrowthRate(dailyGrowthRate)
                    .monthlyGrowthRate(monthGrowthRate)
                    .build();
            int result = fundService.insert(fund);
            System.out.println(result);

            System.out.println("-------");
        }

        System.out.println("size:" + rows.size());
        log.debug("processed {} fund rows", rows.size());
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Queues listing pages 1 to {@value #LAST_PAGE}, crawls them with 10 threads and closes
     * the Spring context when the crawl has ended.
     *
     * @param args unused
     */
    public static void main(String[] args) {

        List<String> urls = new ArrayList<String>();
        for (int pageNo = 1; pageNo <= LAST_PAGE; pageNo++) {
            urls.add(URL_PREFIX + pageNo + URL_SUFFIX);
        }

        NewFundProcessor processor = new NewFundProcessor();
        try {
            // run() blocks until the spider is done, so the context below is only closed
            // after the last insert has been issued.
            Spider.create(processor)
                    .startUrls(urls)
                    .thread(10)
                    .run();
        } finally {
            processor.context.close();
        }

    }
}

执行之后会往数据库插入3000多条基金的数据:

7.png
8.png

推荐阅读更多精彩内容