Commit 20de98eb by 侯昆

关键词初步

1 parent 1b683692
...@@ -2,7 +2,6 @@ package com.dookay.cihai.app; ...@@ -2,7 +2,6 @@ package com.dookay.cihai.app;
import com.dookay.coral.common.core.CoralCommonCoreMarker; import com.dookay.coral.common.core.CoralCommonCoreMarker;
import com.dookay.coral.common.web.CoralCommonWebMarker; import com.dookay.coral.common.web.CoralCommonWebMarker;
import com.dookay.cihai.core.CihaiCoreMarker;
import org.mybatis.spring.annotation.MapperScan; import org.mybatis.spring.annotation.MapperScan;
import org.springframework.boot.SpringApplication; import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication; import org.springframework.boot.autoconfigure.SpringBootApplication;
......
...@@ -35,6 +35,15 @@ ...@@ -35,6 +35,15 @@
<optional>true</optional> <optional>true</optional>
</dependency> </dependency>
<!--开发工具--> <!--开发工具-->
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
</dependency>
<dependency>
<groupId>com.baidu.aip</groupId>
<artifactId>java-sdk</artifactId>
<version>3.4.0</version>
</dependency>
</dependencies> </dependencies>
</project> </project>
...@@ -4,11 +4,25 @@ import com.dookay.coral.common.core.CoralCommonCoreMarker; ...@@ -4,11 +4,25 @@ import com.dookay.coral.common.core.CoralCommonCoreMarker;
import org.mybatis.spring.annotation.MapperScan; import org.mybatis.spring.annotation.MapperScan;
import org.springframework.boot.SpringApplication; import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication; import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.cache.annotation.EnableCaching; import org.springframework.boot.builder.SpringApplicationBuilder;
import org.springframework.boot.web.support.SpringBootServletInitializer;
/** /**
* 项目运行入口
*
* @author houkun * @author houkun
*/ */
public interface CihaiCoreMarker { @SpringBootApplication(
// 加载不同模块的配置与待注入的Bean
scanBasePackageClasses = {
CoralCommonCoreMarker.class,
CihaiCoreApplication.class
})
@MapperScan(basePackageClasses = CihaiCoreApplication.class)
public class CihaiCoreApplication {
public static void main(String[] args) {
SpringApplication.run(CihaiCoreApplication.class, args);
}
} }
package com.dookay.cihai.core.aip;
/*****************************************
* *
* @dookay.com Internet make it happen *
* ----------- ----------------------- *
* dddd ddddd Internet make it happen *
* o o o Internet make it happen *
* k k k Internet make it happen *
* a a a Internet make it happen *
* yyyy yyyyy Internet make it happen *
* ----------- ----------------------- *
* @dookay.com Internet make it happen *
* *
****************************************/
import com.alibaba.fastjson.JSON;
import com.baidu.aip.nlp.AipNlp;
import com.dookay.cihai.core.aip.consts.LexerNeConst;
import com.dookay.cihai.core.aip.consts.LexerPosConst;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.set.ListOrderedSet;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
/**
 * Helper bean built on top of the Baidu AIP NLP client.
 *
 * @author houkun
 * @date 2017/12/6
 */
@Component
@Slf4j
public final class AipUtilBean {
    @Autowired
    private AipNlp aipNlp;

    /**
     * Extracts keywords from a document using the AIP lexical analysis result.
     *
     * <p>Only tokens whose part of speech is accepted by {@link LexerPosConst#inThis(String)}
     * or whose named-entity tag is accepted by {@link LexerNeConst#inThis(String)} are
     * considered. The result is ordered as follows:
     * <ol>
     *   <li>named entities first, in the order they first appear in the document;</li>
     *   <li>then the remaining candidate words, most frequent first (ties keep the order of
     *       first appearance).</li>
     * </ol>
     *
     * @param document text to analyse
     * @param size     maximum number of keywords to return
     * @return at most {@code size} distinct keywords, highest priority first
     * @throws JSONException if the lexer response does not contain a usable {@code items} array
     */
    public List<String> extractKeyWords(String document, int size) throws JSONException {
        JSONObject res = aipNlp.lexer(document);
        if (!res.has("items")) {
            // Fail with the full response instead of the bare "items not found" message.
            throw new JSONException("AIP lexer response contains no \"items\": " + res);
        }
        JSONArray items = res.getJSONArray("items");
        if (log.isDebugEnabled()) {
            // Pretty-printing the whole array is only worth it when it is actually logged.
            log.debug(items.toString(2));
        }

        List<LexerItem> lexerItems = JSON.parseArray(items.toString(), LexerItem.class);

        // Candidate tokens, kept in document order.
        List<LexerItem> candidates = lexerItems.stream()
                .filter(l -> LexerPosConst.inThis(l.getPos())
                        || LexerNeConst.inThis(l.getNe()))
                .collect(Collectors.toList());

        // Every occurrence of a word, grouped by the word itself; the set size is the occurrence count.
        Map<String, Set<LexerItem>> itemMap = candidates.stream()
                .collect(Collectors.groupingBy(LexerItem::getItem, Collectors.toSet()));

        ListOrderedSet<String> words = new ListOrderedSet<>();

        // Rule 1: named entities take priority.
        for (LexerItem candidate : candidates) {
            if (LexerNeConst.inThis(candidate.getNe())) {
                words.add(candidate.getItem());
            }
        }

        // Rule 2: remaining words ordered by occurrence count, most frequent first.
        // Words already added by rule 1 keep their earlier position in the ordered set.
        candidates.stream()
                .map(LexerItem::getItem)
                .distinct()
                .sorted(Comparator.comparingInt((String word) -> itemMap.get(word).size()).reversed())
                .forEach(words::add);

        return words.stream().limit(size).collect(Collectors.toList());
    }
}
package com.dookay.cihai.core.aip;
/*****************************************
* *
* @dookay.com Internet make it happen *
* ----------- ----------------------- *
* dddd ddddd Internet make it happen *
* o o o Internet make it happen *
* k k k Internet make it happen *
* a a a Internet make it happen *
* yyyy yyyyy Internet make it happen *
* ----------- ----------------------- *
* @dookay.com Internet make it happen *
* *
****************************************/
import lombok.Data;
/**
* One token of a lexical analysis result.
* AipUtilBean maps each element of the lexer response's "items" array onto this class.
*
* @author houkun
* @date 2017/12/6
*/
@Data
public class LexerItem {
/**
* Normalized (formal) expression of the word
*/
private String formal;
/**
* The word itself
*/
private String item;
/**
* Part-of-speech tag
*/
private String pos;
/**
* Named-entity (proper noun) tag
*/
private String ne;
/**
* Byte-level length
*/
private int byteLength;
/**
* Byte-level offset
*/
private int byteOffset;
}
package com.dookay.cihai.core.aip.config;
/*****************************************
* *
* @dookay.com Internet make it happen *
* ----------- ----------------------- *
* dddd ddddd Internet make it happen *
* o o o Internet make it happen *
* k k k Internet make it happen *
* a a a Internet make it happen *
* yyyy yyyyy Internet make it happen *
* ----------- ----------------------- *
* @dookay.com Internet make it happen *
* *
****************************************/
import com.baidu.aip.nlp.AipNlp;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
/**
 * Spring configuration that exposes the Baidu AIP NLP client as a bean.
 *
 * @author houkun
 * @date 2017/12/6
 */
@Configuration
public class AipConfig {
    /** Connection timeout applied to the client, in milliseconds. */
    private static final int CONNECTION_TIMEOUT_MILLIS = 2000;
    /** Socket timeout applied to the client, in milliseconds. */
    private static final int SOCKET_TIMEOUT_MILLIS = 60000;

    private final AipProperties aipProperties;

    /**
     * @param aipProperties credentials used to construct the client
     */
    @Autowired
    public AipConfig(AipProperties aipProperties) {
        this.aipProperties = aipProperties;
    }

    /**
     * Builds the {@link AipNlp} client from the configured app id, API key and secret key,
     * then applies the connection and socket timeouts.
     *
     * @return the configured client
     */
    @Bean
    public AipNlp aipNlp() {
        final String appId = aipProperties.getAppId();
        final String apiKey = aipProperties.getApiKey();
        final String secretKey = aipProperties.getSecretKey();

        AipNlp client = new AipNlp(appId, apiKey, secretKey);
        client.setConnectionTimeoutInMillis(CONNECTION_TIMEOUT_MILLIS);
        client.setSocketTimeoutInMillis(SOCKET_TIMEOUT_MILLIS);
        return client;
    }
}
package com.dookay.cihai.core.aip.config;
/*****************************************
* *
* @dookay.com Internet make it happen *
* ----------- ----------------------- *
* dddd ddddd Internet make it happen *
* o o o Internet make it happen *
* k k k Internet make it happen *
* a a a Internet make it happen *
* yyyy yyyyy Internet make it happen *
* ----------- ----------------------- *
* @dookay.com Internet make it happen *
* *
****************************************/
import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.stereotype.Component;
/**
* Configuration properties bound from the "aip" prefix.
* AipConfig passes these three values to the AipNlp constructor.
*
* @author houkun
* @date 2017/12/6
*/
@ConfigurationProperties("aip")
@Data
@Component
public class AipProperties {
// Application id passed to the AIP client
private String appId;
// API key passed to the AIP client
private String apiKey;
// Secret key passed to the AIP client
private String secretKey;
}
package com.dookay.cihai.core.aip.consts;
/*****************************************
* *
* @dookay.com Internet make it happen *
* ----------- ----------------------- *
* dddd ddddd Internet make it happen *
* o o o Internet make it happen *
* k k k Internet make it happen *
* a a a Internet make it happen *
* yyyy yyyyy Internet make it happen *
* ----------- ----------------------- *
* @dookay.com Internet make it happen *
* *
****************************************/
import java.util.Arrays;
/**
 * Part-of-speech tags used by dependency parsing.
 *
 * @author houkun
 * @date 2017/12/6
 */
public interface DepPostTagConst {
    /** Noun. */
    String N = "n";
    /** Nominal adjective. */
    String AN = "an";
    /** Idiom. */
    String I = "i";
    /** Abbreviation. */
    String J = "j";
    /** Idiomatic expression. */
    String L = "l";
    /** Noun morpheme. */
    String NG = "Ng";
    /** Person name. */
    String NR = "nr";
    /** Place name. */
    String NS = "ns";
    /** Organization or group. */
    String NT = "nt";
    /** Other proper noun. */
    String NZ = "nz";

    /**
     * Tells whether a tag is one of the tags declared in this interface.
     *
     * @param postTag tag to look up; {@code null} is never a member
     * @return {@code true} if {@code postTag} equals one of the constants above
     */
    static boolean inThis(String postTag) {
        if (postTag == null) {
            return false;
        }
        switch (postTag) {
            case N:
            case AN:
            case I:
            case J:
            case L:
            case NG:
            case NR:
            case NS:
            case NT:
            case NZ:
                return true;
            default:
                return false;
        }
    }
}
package com.dookay.cihai.core.aip.consts;
/*****************************************
* *
* @dookay.com Internet make it happen *
* ----------- ----------------------- *
* dddd ddddd Internet make it happen *
* o o o Internet make it happen *
* k k k Internet make it happen *
* a a a Internet make it happen *
* yyyy yyyyy Internet make it happen *
* ----------- ----------------------- *
* @dookay.com Internet make it happen *
* *
****************************************/
import java.util.Arrays;
/**
 * Named-entity tags produced by lexical analysis.
 *
 * @author houkun
 * @date 2017/12/6
 */
public interface LexerNeConst {
    /** Person name. */
    String PER = "PER";
    /** Place name. */
    String LOC = "LOC";
    /** Organization name. */
    String ORG = "ORG";

    /**
     * Tells whether a tag is one of the named-entity tags declared in this interface.
     *
     * @param ne tag to look up; {@code null} is never a member
     * @return {@code true} if {@code ne} equals one of the constants above
     */
    static boolean inThis(String ne) {
        final String[] entityTags = {PER, LOC, ORG};
        for (String entityTag : entityTags) {
            if (entityTag.equals(ne)) {
                return true;
            }
        }
        return false;
    }
}
package com.dookay.cihai.core.aip.consts;
/*****************************************
* *
* @dookay.com Internet make it happen *
* ----------- ----------------------- *
* dddd ddddd Internet make it happen *
* o o o Internet make it happen *
* k k k Internet make it happen *
* a a a Internet make it happen *
* yyyy yyyyy Internet make it happen *
* ----------- ----------------------- *
* @dookay.com Internet make it happen *
* *
****************************************/
import java.util.Arrays;
/**
 * Part-of-speech tags from lexical analysis that are treated as important.
 *
 * @author houkun
 * @date 2017/12/6
 */
public interface LexerPosConst {
    /** Common noun. */
    String N = "n";
    /** Person name. */
    String NR = "nr";
    /** Other proper noun. */
    String NZ = "nz";
    /** Organization or group name. */
    String NT = "nt";
    /** Place name. */
    String NS = "ns";

    /**
     * Tells whether a part-of-speech tag is one of the important tags declared here.
     *
     * @param pos tag to look up; {@code null} is never a member
     * @return {@code true} if {@code pos} equals one of the constants above
     */
    static boolean inThis(String pos) {
        return Arrays.stream(new String[] {N, NR, NZ, NT, NS})
                .anyMatch(tag -> tag.equals(pos));
    }
}
package com.dookay.cihai.core;
/*****************************************
* *
* @dookay.com Internet make it happen *
* ----------- ----------------------- *
* dddd ddddd Internet make it happen *
* o o o Internet make it happen *
* k k k Internet make it happen *
* a a a Internet make it happen *
* yyyy yyyyy Internet make it happen *
* ----------- ----------------------- *
* @dookay.com Internet make it happen *
* *
****************************************/
import com.alibaba.fastjson.JSON;
import com.baidu.aip.nlp.AipNlp;
import com.dookay.cihai.core.aip.AipUtilBean;
import com.dookay.cihai.core.aip.LexerItem;
import com.dookay.cihai.core.aip.consts.LexerNeConst;
import com.dookay.cihai.core.aip.consts.LexerPosConst;
import com.hankcs.hanlp.HanLP;
import org.apache.commons.collections4.set.ListOrderedSet;
import org.json.JSONArray;
import org.json.JSONObject;
import org.junit.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.Resource;
import org.springframework.util.FileCopyUtils;
import java.io.File;
import java.io.FileReader;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
/**
 * Manual check of {@link AipUtilBean#extractKeyWords(String, int)}: runs it on a fixed
 * sample text and prints the returned keywords as JSON. Nothing is asserted.
 *
 * @author houkun
 * @date 2017/12/6
 */
public class BaiduTest extends CihaiCoreApplicationTests {
    /** Number of keywords requested from the extractor. */
    private static final int KEYWORD_COUNT = 10;

    /** Sample document fed to the extractor. */
    private static final String SAMPLE_TEXT = "中国共产党第十九次全国代表大会(简称党的十九大)于2017年10月18日至10月24日在北京召开。\n" +
            "2017年10月18日上午9:00,中国共产党第十九次全国代表大会在人民大会堂开幕。习近平代表第十八届中央委员会向大会作了题为《决胜全面建成小康社会 夺取新时代中国特色社会主义伟大胜利》的报告。\n" +
            "这次大会的主题是:不忘初心,牢记使命,高举中国特色社会主义伟大旗帜,决胜全面建成小康社会,夺取新时代中国特色社会主义伟大胜利,为实现中华民族伟大复兴的中国梦不懈奋斗。\n" +
            "党的十九大,是在全面建成小康社会决胜阶段、中国特色社会主义发展关键时期召开的一次十分重要的大会。承担着谋划决胜全面建成小康社会、深入推进社会主义现代化建设的重大任务,事关党和国家事业继往开来,事关中国特色社会主义前途命运,事关最广大人民根本利益。[1] \n" +
            "2017年10月24日,中国共产党第十九次全国代表大会在选举产生新一届中央委员会和中央纪律检查委员会,通过关于十八届中央委员会报告的决议、关于十八届中央纪律检查委员会工作报告的决议、关于《中国共产党章程(修正案)》的决议后,在人民大会堂胜利闭幕。[2-3] ";

    @Autowired
    private AipUtilBean aipUtilBean;

    /**
     * Extracts keywords from the sample text and prints them.
     * An earlier variant, since disabled, read the document from the classpath resource text.txt.
     */
    @Test
    public void test() throws Exception {
        List<String> keywords = aipUtilBean.extractKeyWords(SAMPLE_TEXT, KEYWORD_COUNT);
        String keywordsJson = JSON.toJSONString(keywords);
        System.out.println(keywordsJson);
    }
}
...@@ -4,6 +4,7 @@ import org.junit.Test; ...@@ -4,6 +4,7 @@ import org.junit.Test;
import org.junit.runner.RunWith; import org.junit.runner.RunWith;
import org.mybatis.spring.annotation.MapperScan; import org.mybatis.spring.annotation.MapperScan;
import org.springframework.boot.test.context.SpringBootTest; import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.context.annotation.Import;
import org.springframework.test.annotation.Rollback; import org.springframework.test.annotation.Rollback;
import org.springframework.test.context.junit4.SpringRunner; import org.springframework.test.context.junit4.SpringRunner;
import org.springframework.transaction.annotation.Transactional; import org.springframework.transaction.annotation.Transactional;
...@@ -12,7 +13,7 @@ import org.springframework.transaction.annotation.Transactional; ...@@ -12,7 +13,7 @@ import org.springframework.transaction.annotation.Transactional;
@SpringBootTest(webEnvironment = SpringBootTest.WebEnvironment.NONE) @SpringBootTest(webEnvironment = SpringBootTest.WebEnvironment.NONE)
@Rollback @Rollback
@Transactional @Transactional
@MapperScan(basePackageClasses = CihaiCoreMarker.class) @Import(CihaiCoreApplication.class)
public class CihaiCoreApplicationTests { public class CihaiCoreApplicationTests {
@Test @Test
......
package com.dookay.cihai.core;
/*****************************************
* *
* @dookay.com Internet make it happen *
* ----------- ----------------------- *
* dddd ddddd Internet make it happen *
* o o o Internet make it happen *
* k k k Internet make it happen *
* a a a Internet make it happen *
* yyyy yyyyy Internet make it happen *
* ----------- ----------------------- *
* @dookay.com Internet make it happen *
* *
****************************************/
import com.hankcs.hanlp.HanLP;
import lombok.extern.slf4j.Slf4j;
import org.junit.Test;
import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.Resource;
import org.springframework.util.FileCopyUtils;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.List;
/**
 * Word segmentation test: runs HanLP phrase extraction on the classpath resource
 * {@code text.txt} and logs the result.
 *
 * @author houkun
 * @date 2017/12/6
 */
@Slf4j
public class HankcsTest {
    /**
     * Reads {@code text.txt} from the classpath, extracts 10 phrases with HanLP and logs them.
     *
     * @throws IOException if the resource cannot be opened or read
     */
    @Test
    public void test() throws IOException {
        Resource resource = new ClassPathResource("text.txt");
        String document;
        // Read through the resource stream with an explicit charset: FileReader would decode
        // with the platform default, and getFile() fails when the resource is inside a jar.
        // NOTE(review): assumes text.txt is stored as UTF-8 - confirm.
        try (InputStreamReader reader =
                     new InputStreamReader(resource.getInputStream(), StandardCharsets.UTF_8)) {
            document = FileCopyUtils.copyToString(reader);
        }
        List<String> phrases = HanLP.extractPhrase(document, 10);
        log.info(phrases.toString());
    }
}
# 数据库配置 # 数据库配置
# 数据库连接 # 数据库连接
spring.datasource.url=jdbc:mysql://localhost:3306/test spring.datasource.url=jdbc:mysql://192.168.2.24:3306/cihai
spring.datasource.username=root spring.datasource.username=root
# 加密后的密码 # 加密后的密码
spring.datasource.password=UvoEsj8TZ9a11DQRR42EGuoIRjeuqSagmUzCdUeDnw9PJ9hum0Lx/MTPrpIR7isrPdA8lMpaY77eW4g3f1tCQQ== spring.datasource.password=Lvv/S6CDCaeRONtvcUDlHrty4nYxnQFAIcT43dJOqCenmj9x/dDzX9i3S9H5AdskVo8KNR5hJAOHw6SmjjlXLg==
# 加密时的公钥 # 加密时的公钥
public-key=MFwwDQYJKoZIhvcNAQEBBQADSwAwSAJBAJFbtR5GoNcW9j8b4RhZQ1CC1xNBx8Sqphxc8/6vNWYdd7d84AUfSAzFXCzGvuvJ0URNAg9IykPDexY/mHP8dA0CAwEAAQ== public-key=MFwwDQYJKoZIhvcNAQEBBQADSwAwSAJBAKIR34/oer1hySHvAOf/1XGzImWZucYu5Kv4sxerDESPP0qdI5HRL+E1S5lH7Hpt1BYOoZhHMgZ6qfqxAC2rpKkCAwEAAQ==
# druid解密配置 # druid解密配置
spring.datasource.druid.connection-properties=config.decrypt=true;config.decrypt.key=${public-key} spring.datasource.druid.connection-properties=config.decrypt=true;config.decrypt.key=${public-key}
spring.datasource.druid.filter.config.enabled=true spring.datasource.druid.filter.config.enabled=true
...@@ -14,3 +14,7 @@ spring.datasource.druid.filter.config.enabled=true ...@@ -14,3 +14,7 @@ spring.datasource.druid.filter.config.enabled=true
# 设置扫描 mapper xml路径 # 设置扫描 mapper xml路径
mapper.mappers=com.dookay.coral.common.core.persistence.Mapper mapper.mappers=com.dookay.coral.common.core.persistence.Mapper
mybatis.mapper-locations=classpath*:mapper/*.xml mybatis.mapper-locations=classpath*:mapper/*.xml
aip.app-id=10486245
aip.api-key=ws8qdxT51xm2qbWufxzRedI3
aip.secret-key=8b6g9ZyR69dFl6aqYdIOGa4IbOGgkdjh
...@@ -2,7 +2,6 @@ package com.dookay.cihai.jsp; ...@@ -2,7 +2,6 @@ package com.dookay.cihai.jsp;
import com.dookay.coral.common.core.CoralCommonCoreMarker; import com.dookay.coral.common.core.CoralCommonCoreMarker;
import com.dookay.coral.common.web.CoralCommonWebMarker; import com.dookay.coral.common.web.CoralCommonWebMarker;
import com.dookay.cihai.core.CihaiCoreMarker;
import org.mybatis.spring.annotation.MapperScan; import org.mybatis.spring.annotation.MapperScan;
import org.springframework.boot.SpringApplication; import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication; import org.springframework.boot.autoconfigure.SpringBootApplication;
......
...@@ -3,9 +3,9 @@ debug=true ...@@ -3,9 +3,9 @@ debug=true
spring.datasource.url=jdbc:mysql://localhost:3306/test spring.datasource.url=jdbc:mysql://localhost:3306/test
spring.datasource.username=root spring.datasource.username=root
# 加密后的密码 # 加密后的密码
spring.datasource.password=UvoEsj8TZ9a11DQRR42EGuoIRjeuqSagmUzCdUeDnw9PJ9hum0Lx/MTPrpIR7isrPdA8lMpaY77eW4g3f1tCQQ== spring.datasource.password=Lvv/S6CDCaeRONtvcUDlHrty4nYxnQFAIcT43dJOqCenmj9x/dDzX9i3S9H5AdskVo8KNR5hJAOHw6SmjjlXLg==
# 加密时的公钥 # 加密时的公钥
public-key=MFwwDQYJKoZIhvcNAQEBBQADSwAwSAJBAJFbtR5GoNcW9j8b4RhZQ1CC1xNBx8Sqphxc8/6vNWYdd7d84AUfSAzFXCzGvuvJ0URNAg9IykPDexY/mHP8dA0CAwEAAQ== public-key=MFwwDQYJKoZIhvcNAQEBBQADSwAwSAJBAKIR34/oer1hySHvAOf/1XGzImWZucYu5Kv4sxerDESPP0qdI5HRL+E1S5lH7Hpt1BYOoZhHMgZ6qfqxAC2rpKkCAwEAAQ==
# 日志 # 日志
logging.level.root=info logging.level.root=info
logging.level.com.dookay.core=trace logging.level.com.dookay.core=trace
......
...@@ -53,6 +53,14 @@ ...@@ -53,6 +53,14 @@
</dependency> </dependency>
<!--项目内依赖--> <!--项目内依赖-->
<!--自然语言处理-->
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.5.2</version>
</dependency>
<!--自然语言处理-->
</dependencies> </dependencies>
</dependencyManagement> </dependencyManagement>
......
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!