Difference between revisions of "Workdocumentation 2021-05-03"
Jump to navigation
Jump to search
(Created page with "= Question = What happens if the relevance matrix approach is applied to proceedings title parsing (later: parsing in general)? = Assumption = Following a hierarchy of letter,...") |
|||
| Line 11: | Line 11: | ||
Input: Proceedings titles of dblp conference entries. | Input: Proceedings titles of dblp conference entries. | ||
| + | |||
| + | == Letter == | ||
| + | <source lang='python'> | ||
| + | def testMostCommonFirstLetter(self): | ||
| + | ''' | ||
| + | get the most common first letters | ||
| + | ''' | ||
| + | dblp,foundEvents=self.getEvents() | ||
| + | self.assertTrue(foundEvents>43950) | ||
| + | # collect first letters | ||
| + | counter=Counter() | ||
| + | for eventId in dblp.em.events: | ||
| + | if eventId.startswith("conf"): | ||
| + | event=dblp.em.events[eventId] | ||
| + | first=ord(event.title[0]) | ||
| + | counter[first]+=1 | ||
| + | bins=len(counter.keys()) | ||
| + | print(f" {bins} different first letters found") | ||
| + | for o,count in counter.most_common(bins): | ||
| + | c=chr(o) | ||
| + | print (f"{c}: {count}") | ||
| + | </source> | ||
| + | <pre> | ||
| + | 46 different first letters found | ||
| + | P: 12599 | ||
| + | 2: 3526 | ||
| + | I: 3515 | ||
| + | A: 3296 | ||
| + | C: 2333 | ||
| + | S: 2260 | ||
| + | 1: 2105 | ||
| + | T: 1559 | ||
| + | M: 1312 | ||
| + | E: 1252 | ||
| + | F: 1246 | ||
| + | D: 1177 | ||
| + | R: 624 | ||
| + | H: 578 | ||
| + | N: 566 | ||
| + | 3: 564 | ||
| + | W: 522 | ||
| + | L: 502 | ||
| + | G: 501 | ||
| + | B: 479 | ||
| + | 4: 354 | ||
| + | V: 334 | ||
| + | K: 257 | ||
| + | O: 255 | ||
| + | 5: 252 | ||
| + | U: 236 | ||
| + | 9: 215 | ||
| + | 6: 211 | ||
| + | 7: 199 | ||
| + | 8: 187 | ||
| + | J: 150 | ||
| + | X: 88 | ||
| + | Q: 76 | ||
| + | e: 19 | ||
| + | Z: 13 | ||
| + | i: 12 | ||
| + | p: 7 | ||
| + | «: 5 | ||
| + | (: 3 | ||
| + | ": 2 | ||
| + | d: 2 | ||
| + | f: 1 | ||
| + | t: 1 | ||
| + | s: 1 | ||
| + | ': 1 | ||
| + | Y: 1 | ||
| + | ---------------------------------------------------------------------- | ||
| + | Ran 1 test in 0.577s | ||
| + | </pre> | ||
Revision as of 07:27, 3 May 2021
Question
What happens if the relevance matrix approach is applied to proceedings title parsing (later: parsing in general)?
Assumption
Following the hierarchy of letter, token, grammatical structure, and sentence along the relevance matrix path, column first (depth first), leads to interesting observations.
Experiment
Hierarchy of:
- Letter
- Token
- Grammatical structure
- Sentence
Input: Proceedings titles of dblp conference entries.
Letter
def testMostCommonFirstLetter(self):
'''
get the most common first letters
'''
dblp,foundEvents=self.getEvents()
self.assertTrue(foundEvents>43950)
# collect first letters
counter=Counter()
for eventId in dblp.em.events:
if eventId.startswith("conf"):
event=dblp.em.events[eventId]
first=ord(event.title[0])
counter[first]+=1
bins=len(counter.keys())
print(f" {bins} different first letters found")
for o,count in counter.most_common(bins):
c=chr(o)
print (f"{c}: {count}")
46 different first letters found
P: 12599
2: 3526
I: 3515
A: 3296
C: 2333
S: 2260
1: 2105
T: 1559
M: 1312
E: 1252
F: 1246
D: 1177
R: 624
H: 578
N: 566
3: 564
W: 522
L: 502
G: 501
B: 479
4: 354
V: 334
K: 257
O: 255
5: 252
U: 236
9: 215
6: 211
7: 199
8: 187
J: 150
X: 88
Q: 76
e: 19
Z: 13
i: 12
p: 7
«: 5
(: 3
": 2
d: 2
f: 1
t: 1
s: 1
': 1
Y: 1
----------------------------------------------------------------------
Ran 1 test in 0.577s