C Language Implementation of Porter’s Algorithm

The Porter stemming algorithm (or 'Porter stemmer') is a process for removing the commoner morphological and inflexional endings from words in English. Its main use is as part of a term normalisation process that is usually done when setting up Information Retrieval systems. The rules in the Porter algorithm are separated into five distinct phases numbered from 1 to 5. They are applied to the words in the text starting from phase 1 and moving on to phase 5. Further, they are applied sequentially, one after the other, like commands in a program.

This is the Porter stemming algorithm, coded up in ANSI C by the author. It may be regarded as canonical, in that it follows the algorithm presented in Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, no. 3, pp 130-137, only differing from it at the points marked --DEPARTURE-- below.

The algorithm as described in the paper could be exactly replicated by adjusting the points of DEPARTURE, but this is barely necessary, because (a) the points of DEPARTURE are definitely improvements, and (b) no encoding of the Porter stemmer I have seen is anything like as exact as this version, even with the points of DEPARTURE!

You can compile it on Unix with 'gcc -O3 -o stem stem.c', after which "stem" takes a list of inputs and sends the stemmed equivalent to stdout.

The algorithm as encoded here is particularly fast.

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

#include <stdio.h>

#include <string.h>

#define TRUE 1

#define FALSE 0

/* The main part of the stemming algorithm starts here. b is a buffer
   holding a word to be stemmed. The letters are in b[k0], b[k0+1] ...
   ending at b[k]. In fact k0 = 0 in this demo program. k is readjusted
   downwards as the stemming progresses. Zero termination is not in fact
   used in the algorithm.

   Note that only lower case sequences are stemmed. Forcing to lower case
   should be done before stem(...) is called.
*/

static char *b;      /* buffer for word to be stemmed */

static int k, k0, j; /* j is a general offset into the string */

/* cons(i) is TRUE <=> b[i] is a consonant.

   'a', 'e', 'i', 'o' and 'u' are never consonants. 'y' is a consonant when
   it is the first letter of the word (i == k0) or when the letter before it
   is not a consonant; otherwise it counts as a vowel. Every other character
   is treated as a consonant. */

int cons(int i)
{
    switch (b[i]) {
    case 'a':
    case 'e':
    case 'i':
    case 'o':
    case 'u':
        return FALSE;
    case 'y':
        /* The status of 'y' depends on the preceding letter. */
        return (i == k0) ? TRUE : !cons(i - 1);
    default:
        return TRUE;
    }
}

/* m() measures the quantity of consonant successions in the middle of k0 and j. on the off chance that c is

a consonant succession and v a vowel arrangement, and <..> shows self-assertive

vicinity,

<c><v> gives 0

<c>vc<v> gives 1

<c>vcvc<v> gives 2

<c>vcvcvc<v> gives 3

....

*/

int m()

{ int n = 0;

int i = k0;

while(TRUE)

{ on the off chance that (i > j) return n;

on the off chance that (! cons(i)) break; i++;

}

i++;

while(TRUE)

{ while(TRUE)

{ on the off chance that (i > j) return n;

on the off chance that (cons(i)) break;

i++;

}

i++;

n++;

while(TRUE)

{ on the off chance that (i > j) return n;

on the off chance that (! cons(i)) break;

i++;

}

i++;

}

}

/* vowelinstem() is TRUE <=> b[k0], ... b[j] contains a vowel, i.e. at least
   one position in that range for which cons() is false. */

int vowelinstem(void)
{
    int i;

    for (i = k0; i <= j; i++) {
        if (!cons(i)) return TRUE;
    }
    return FALSE;
}

/* doublec(j) is TRUE <=> j,(j-1) contain a twofold consonant. */

int doublec(int j)

{ if (j < k0+1) return FALSE;

on the off chance that (b[j] != b[j-1]) return FALSE;

return cons(j);

}

/* cvc(i) is TRUE <=> i-2,i-1,i has the structure consonant - vowel - consonant

furthermore if the second c is not w,x or y. this is utilized when attempting to

restore an e toward the end of a short word. e.g.

cav(e), lov(e), hop(e), crim(e), however

snow, box, plate.

*/

int cvc(int i)

{ on the off chance that (i < k0+2 || !cons(i) || cons(i-1) || !cons(i-2)) return FALSE;

{ int ch = b[i];

on the off chance that (ch == "w" || ch == "x" || ch == 'y') return FALSE;

}

return TRUE;

}

/* ends(s) is TRUE <=> k0,...k closes with the string s. */

int ends(char * s)

{ int length = s[0];

on the off chance that (s[length] != b[k]) return FALSE;/* little accelerate */

in the event that (length > k-k0+1) return FALSE;

in the event that (memcmp(b+k-length+1,s+1,length) != 0) return FALSE;

j = k-length;

return TRUE;

}

/* setto(s) sets b[j+1], ... to the characters in the string s, readjusting
   k so that it indexes the last character written.

   s is length-prefixed: s[0] is the character count and the characters
   follow from s[1]. No bounds check is made here.
   NOTE(review): presumably the caller guarantees b has room for j + length
   characters -- confirm against the code that allocates b. */

void setto(char *s)
{
    int length = s[0];

    memmove(b + j + 1, s + 1, length);
    k = j + length;
}

/* r(s) replaces the suffix found by the preceding successful ends() call
   with s, via setto(s), but only when the stem b[k0..j] has m() > 0.
   It is used by the step functions further down. */

void r(char *s)
{
    if (m() > 0) setto(s);
}

/* step1ab() disposes of plurals and - ed or - ing. e.g.

strokes - > touch

horses - > poni

ties - > ti

stroke - > touch

felines - > feline

encourage - > sustain

concurred - > concur

incapacitated - > debilitate

tangling - > mat

mating - > mate

meeting - > meet

processing - > factory

messing - > mess

gatherings - > meet

*/

void step1ab()

{ if (b[k] == 's')

{ if (ends("\04" "sses")) k - = 2; else

in the event that (ends("\03" "ies")) setto("\01" "i"); else

in the event that (b[k-1] != 's') k- - ;

}

in the event that (ends("\03" "eed")) { if (m() > 0) k- - ; } else

in the event that ((ends("\02" "ed") || ends("\03" "ing")) && vowelinstem())

{ k = j;

in the event that (ends("\02" "at")) setto("\03" "ate"); else

in the event that (ends("\02" "bl")) setto("\03" "ble"); else

in the event that (ends("\02" "iz")) setto("\03" "ize"); else

on the off chance that (doublec(k))

{ k- - ;

{ int ch = b[k];

on the off chance that (ch == "l" || ch == "s" || ch == 'z') k++;

}

}

else if (m() == 1 && cvc(k)) setto("\01" "e");

}

}

/* step1c() turns terminal y to i when there is another vowel in the stem. */

void step1c(void)
{
    if (ends("\01" "y") && vowelinstem()) {
        b[k] = 'i';
    }
}

/* step2() maps double suffices to single ones. So -ization ( = -ize plus
   -ation) maps to -ize etc. Note that the string before the suffix must give
   m() > 0.

   The switch is on the penultimate letter b[k-1], which narrows down the
   candidate suffixes before any ends() comparison is made. */

void step2(void)
{
    switch (b[k - 1]) {
    case 'a':
        if (ends("\07" "ational")) { r("\03" "ate"); break; }
        if (ends("\06" "tional")) { r("\04" "tion"); break; }
        break;
    case 'c':
        if (ends("\04" "enci")) { r("\04" "ence"); break; }
        if (ends("\04" "anci")) { r("\04" "ance"); break; }
        break;
    case 'e':
        if (ends("\04" "izer")) { r("\03" "ize"); break; }
        break;
    case 'l':
        if (ends("\03" "bli")) { r("\03" "ble"); break; } /*-DEPARTURE-*/

        /* To match the published algorithm, replace the line above with
           if (ends("\04" "abli")) { r("\04" "able"); break; } */

        if (ends("\04" "alli")) { r("\02" "al"); break; }
        if (ends("\05" "entli")) { r("\03" "ent"); break; }
        if (ends("\03" "eli")) { r("\01" "e"); break; }
        if (ends("\05" "ousli")) { r("\03" "ous"); break; }
        break;
    case 'o':
        if (ends("\07" "ization")) { r("\03" "ize"); break; }
        if (ends("\05" "ation")) { r("\03" "ate"); break; }
        if (ends("\04" "ator")) { r("\03" "ate"); break; }
        break;
    case 's':
        if (ends("\05" "alism")) { r("\02" "al"); break; }
        if (ends("\07" "iveness")) { r("\03" "ive"); break; }
        if (ends("\07" "fulness")) { r("\03" "ful"); break; }
        if (ends("\07" "ousness")) { r("\03" "ous"); break; }
        break;
    case 't':
        if (ends("\05" "aliti")) { r("\02" "al"); break; }
        if (ends("\05" "iviti")) { r("\03" "ive"); break; }
        if (ends("\06" "biliti")) { r("\03" "ble"); break; }
        break;
    case 'g':
        if (ends("\04" "logi")) { r("\03" "log"); break; } /*-DEPARTURE-*/

        /* To match the published algorithm, delete the line above */
        break;
    default:
        break;
    }
}

/* step3() deals with -ic-, -full, -ness etc. Similar strategy to step2. */

void step3() { switch (b[k])

{

case 'e': if (ends("\05" "icate")) { r("\02" "ic"); break; }

in the event that (ends("\05" "ative")) { r("\00" ""); break; }

in the event that (ends("\05" "alize")) { r("\02" "al"); break; }

break;

case 'i': if (ends("\05" "iciti")) { r("\02" "ic"); break; }

break;

case 'l': if (ends("\04" "ical")) { r("\02" "ic"); break; }

in the event that (ends("\03" "ful")) { r("\00" ""); break; }

break;

case 's': if (ends("\04"

C Language Implementation of Porter’s Algorithm

Next

Newer Post

Previous

Older Post